sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/constants.py +3 -0
  6. sglang/srt/conversation.py +14 -3
  7. sglang/srt/custom_op.py +11 -1
  8. sglang/srt/disaggregation/base/conn.py +2 -0
  9. sglang/srt/disaggregation/decode.py +22 -28
  10. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  11. sglang/srt/disaggregation/mini_lb.py +34 -4
  12. sglang/srt/disaggregation/mooncake/conn.py +301 -64
  13. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  14. sglang/srt/disaggregation/nixl/conn.py +94 -46
  15. sglang/srt/disaggregation/prefill.py +20 -15
  16. sglang/srt/disaggregation/utils.py +47 -18
  17. sglang/srt/distributed/parallel_state.py +12 -4
  18. sglang/srt/entrypoints/engine.py +27 -31
  19. sglang/srt/entrypoints/http_server.py +149 -79
  20. sglang/srt/entrypoints/http_server_engine.py +0 -3
  21. sglang/srt/entrypoints/openai/__init__.py +0 -0
  22. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
  23. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  24. sglang/srt/entrypoints/openai/serving_chat.py +897 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +425 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
  27. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  28. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  29. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  30. sglang/srt/entrypoints/openai/utils.py +72 -0
  31. sglang/srt/function_call/base_format_detector.py +7 -4
  32. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  33. sglang/srt/function_call/ebnf_composer.py +64 -10
  34. sglang/srt/function_call/function_call_parser.py +6 -6
  35. sglang/srt/function_call/llama32_detector.py +1 -1
  36. sglang/srt/function_call/mistral_detector.py +1 -1
  37. sglang/srt/function_call/pythonic_detector.py +1 -1
  38. sglang/srt/function_call/qwen25_detector.py +1 -1
  39. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  40. sglang/srt/layers/activation.py +28 -3
  41. sglang/srt/layers/attention/aiter_backend.py +5 -2
  42. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  43. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  44. sglang/srt/layers/attention/flashattention_backend.py +43 -23
  45. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  46. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  47. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  48. sglang/srt/layers/attention/tbo_backend.py +3 -3
  49. sglang/srt/layers/attention/triton_backend.py +19 -11
  50. sglang/srt/layers/communicator.py +5 -5
  51. sglang/srt/layers/dp_attention.py +11 -2
  52. sglang/srt/layers/layernorm.py +44 -2
  53. sglang/srt/layers/linear.py +18 -1
  54. sglang/srt/layers/logits_processor.py +14 -5
  55. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  56. sglang/srt/layers/moe/ep_moe/layer.py +286 -13
  57. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
  58. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
  62. sglang/srt/layers/moe/topk.py +117 -4
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  64. sglang/srt/layers/quantization/fp8.py +25 -17
  65. sglang/srt/layers/quantization/fp8_utils.py +5 -4
  66. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  67. sglang/srt/layers/quantization/utils.py +5 -2
  68. sglang/srt/layers/rotary_embedding.py +144 -12
  69. sglang/srt/layers/sampler.py +1 -1
  70. sglang/srt/layers/vocab_parallel_embedding.py +14 -1
  71. sglang/srt/lora/lora_manager.py +173 -74
  72. sglang/srt/lora/mem_pool.py +49 -45
  73. sglang/srt/lora/utils.py +1 -1
  74. sglang/srt/managers/cache_controller.py +33 -15
  75. sglang/srt/managers/expert_distribution.py +21 -0
  76. sglang/srt/managers/io_struct.py +19 -14
  77. sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
  78. sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
  79. sglang/srt/managers/schedule_batch.py +49 -32
  80. sglang/srt/managers/schedule_policy.py +70 -56
  81. sglang/srt/managers/scheduler.py +189 -68
  82. sglang/srt/managers/template_manager.py +226 -0
  83. sglang/srt/managers/tokenizer_manager.py +11 -8
  84. sglang/srt/managers/tp_worker.py +12 -2
  85. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  86. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  87. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  88. sglang/srt/mem_cache/chunk_cache.py +11 -16
  89. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  90. sglang/srt/mem_cache/memory_pool.py +118 -114
  91. sglang/srt/mem_cache/radix_cache.py +20 -16
  92. sglang/srt/model_executor/cuda_graph_runner.py +77 -46
  93. sglang/srt/model_executor/forward_batch_info.py +18 -5
  94. sglang/srt/model_executor/model_runner.py +27 -8
  95. sglang/srt/model_loader/loader.py +50 -8
  96. sglang/srt/model_loader/weight_utils.py +100 -2
  97. sglang/srt/models/deepseek_nextn.py +35 -30
  98. sglang/srt/models/deepseek_v2.py +255 -30
  99. sglang/srt/models/gemma3n_audio.py +949 -0
  100. sglang/srt/models/gemma3n_causal.py +1009 -0
  101. sglang/srt/models/gemma3n_mm.py +511 -0
  102. sglang/srt/models/glm4.py +312 -0
  103. sglang/srt/models/hunyuan.py +771 -0
  104. sglang/srt/models/mimo_mtp.py +2 -18
  105. sglang/srt/reasoning_parser.py +21 -11
  106. sglang/srt/server_args.py +51 -9
  107. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  108. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  109. sglang/srt/speculative/eagle_utils.py +80 -8
  110. sglang/srt/speculative/eagle_worker.py +124 -41
  111. sglang/srt/torch_memory_saver_adapter.py +19 -15
  112. sglang/srt/two_batch_overlap.py +4 -1
  113. sglang/srt/utils.py +248 -11
  114. sglang/test/test_block_fp8_ep.py +1 -0
  115. sglang/test/test_utils.py +1 -0
  116. sglang/version.py +1 -1
  117. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
  118. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
  119. sglang/srt/entrypoints/verl_engine.py +0 -179
  120. sglang/srt/openai_api/adapter.py +0 -2148
  121. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
  123. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_policy.py
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # Copyright 2023-2024 SGLang Team
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -18,15 +20,17 @@ import random
  from collections import defaultdict
  from contextlib import contextmanager
  from enum import Enum, auto
- from typing import Dict, List, Optional, Set, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

  import torch

  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
- from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
  from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode

+ if TYPE_CHECKING:
+     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+
  # Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
  # This can prevent the server from being too conservative.
  # Note that this only clips the estimation in the scheduler but does not change the stop
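For context: the new import block follows the standard postponed-annotations pattern, where a module needed only for type hints is imported under TYPE_CHECKING so it never loads at runtime. A minimal sketch of the same pattern, with check_allocator as a hypothetical consumer (not part of the diff):

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by static type checkers, never at runtime,
        # so importing the allocator module cannot create an import cycle.
        from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator

    def check_allocator(allocator: BaseTokenToKVPoolAllocator) -> bool:
        # With postponed annotations the hint above stays a string at runtime,
        # so the TYPE_CHECKING-only import is never needed for execution.
        return hasattr(allocator, "available_size")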
@@ -51,6 +55,9 @@ IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD = int(
  )


+ IGNORE_EOS_RESERVE_TOKENS = 1
+
+
  class CacheAwarePolicy(Enum):
      """Scheduling policies that are aware of the tree cache."""

@@ -90,7 +97,7 @@ class SchedulePolicy:
      def calc_priority(self, waiting_queue: List[Req]) -> bool:
          if self.policy == CacheAgnosticPolicy.FCFS:
              # A shortcut for FCFS
-             return
+             return False

          policy = self._determine_active_policy(waiting_queue)

@@ -134,7 +141,7 @@ class SchedulePolicy:
          """
          try:
              policy_enum = CacheAwarePolicy(policy)
-             if tree_cache.disable:
+             if getattr(tree_cache, "disable", True):
                  # If tree_cache is disabled, using CacheAgnosticPolicy policy
                  return CacheAgnosticPolicy.FCFS
              return policy_enum
@@ -158,14 +165,9 @@ class SchedulePolicy:
              prefix_ids = r.adjust_max_prefix_ids()

              # NOTE: the prefix_indices must always be aligned with last_node
-             if self.enable_hierarchical_cache:
-                 r.prefix_indices, r.last_node, r.last_node_global = (
-                     self.tree_cache.match_prefix(key=prefix_ids, include_evicted=True)
-                 )
-             else:
-                 r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
-                     rid=r.rid, key=prefix_ids
-                 )
+             r.prefix_indices, r.last_node, r.last_host_node, r.host_hit_length = (
+                 self.tree_cache.match_prefix(rid=r.rid, key=prefix_ids)
+             )

              # NOTE(sang): This logic is for in-batch prefix caching;
              # If there are more than 1 request that have small matching prefix from
@@ -175,7 +177,7 @@ class SchedulePolicy:
              # threshold means we cannot use in-batch prefix caching for short prefixes.
              # It is kind of common when the engine is long running (e.g., imagine the prefix "the").
              if len(r.prefix_indices) <= IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD:
-                 in_batch_matching_prefixes, _ = (
+                 in_batch_matching_prefixes, _, _, _ = (
                      self.waiting_queue_radix_tree.match_prefix(
                          rid=r.rid, key=prefix_ids
                      )
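For context: both hunks above reflect the same interface change, where match_prefix now returns four values for every cache type, so hierarchical and non-hierarchical callers share one code path. A hedged sketch of the new return shape, using a stub cache rather than the real radix tree (not part of the diff):

    from typing import Any, List, Tuple

    class StubTreeCache:
        """Stand-in for a radix-tree cache, only to show the four-value return."""

        def match_prefix(self, rid: str, key: List[int]) -> Tuple[List[int], Any, Any, int]:
            # Pretend the first two tokens are cached on the device and none on the host.
            return key[:2], None, None, 0

    prefix_indices, last_node, last_host_node, host_hit_length = StubTreeCache().match_prefix(
        rid="req-0", key=[11, 22, 33, 44]
    )
    assert host_hit_length == 0  # nothing needs to be loaded back from host memory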
@@ -268,14 +270,16 @@ class AddReqResult(Enum):
  class PrefillAdder:
      def __init__(
          self,
+         page_size: int,
          tree_cache: BasePrefixCache,
-         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+         token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
          running_batch: ScheduleBatch,
          new_token_ratio: float,
          rem_input_tokens: int,
          rem_chunk_tokens: Optional[int],
          mixed_with_decode_tokens: int = 0,
      ):
+         self.page_size = page_size
          self.tree_cache = tree_cache
          self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
          self.running_batch = running_batch
@@ -292,6 +296,7 @@ class PrefillAdder:
          self.can_run_list = []
          self.new_chunked_req = None
          self.log_hit_tokens = 0
+         # TODO(lsyin): report the real input tokens excluding page alignment
          self.log_input_tokens = 0

          if running_batch is not None:
@@ -322,6 +327,9 @@ class PrefillAdder:
              - self.cur_rem_token_offset
          )

+     def ceil_paged_tokens(self, tokens: int) -> int:
+         return -(-tokens // self.page_size) * self.page_size
+
      def budget_state(self):
          if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
              return AddReqResult.NO_TOKEN
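For context: ceil_paged_tokens rounds a token count up to a whole number of KV-cache pages using the negate-floor-negate trick, i.e. ceil(a / b) written as -(-a // b). A minimal standalone sketch of the same arithmetic (not part of the diff):

    def ceil_paged_tokens(tokens: int, page_size: int) -> int:
        # -(-tokens // page_size) is ceil(tokens / page_size) for a positive page_size
        return -(-tokens // page_size) * page_size

    assert ceil_paged_tokens(1, 16) == 16    # a partial page still costs a full page
    assert ceil_paged_tokens(16, 16) == 16   # an exact multiple is unchanged
    assert ceil_paged_tokens(17, 16) == 32   # one token over spills into a second page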
@@ -333,9 +341,12 @@ class PrefillAdder:

          return AddReqResult.CONTINUE

-     def _prefill_one_req(
+     def _update_prefill_budget(
          self, prefix_len: int, extend_input_len: int, max_new_tokens: int
      ):
+         # TODO(lsyin): check this workaround logic, which only ensures the prefill will not out of memory, and may be too conservative
+         extend_input_len = self.ceil_paged_tokens(extend_input_len)
+
          self.rem_total_token_offset += extend_input_len + max_new_tokens
          self.cur_rem_token_offset += extend_input_len
          self.rem_input_tokens -= extend_input_len
@@ -350,7 +361,7 @@ class PrefillAdder:
          req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
          req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
          self.can_run_list.append(req)
-         self._prefill_one_req(
+         self._update_prefill_budget(
              0,
              req.extend_input_len,
              (
@@ -372,6 +383,12 @@ class PrefillAdder:
              self.tree_cache.dec_lock_ref(last_node)

      def add_one_req_ignore_eos(self, req: Req, has_chunked_req: bool):
+         # Early exit if no enough tokens for the input tokens
+         if self.ceil_paged_tokens(req.extend_input_len) > min(
+             self.cur_rem_tokens, self.rem_total_tokens
+         ):
+             return AddReqResult.NO_TOKEN
+
          def add_req_state(r, insert_sort=False):
              new_token_ratio = (
                  1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
@@ -381,15 +398,17 @@ class PrefillAdder:
              )
              tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)

-             if tokens_left > 0:
-                 if not insert_sort:
-                     self.req_states.append((tokens_left, tokens_occupied))
-                 else:
-                     i = 0
-                     for i in range(len(self.req_states)):
-                         if tokens_left <= self.req_states[i][0]:
-                             break
-                     self.req_states.insert(i, (tokens_left, tokens_occupied))
+             if tokens_left <= 0:
+                 return
+
+             if not insert_sort:
+                 self.req_states.append((tokens_left, tokens_occupied))
+             else:
+                 i = 0
+                 for i in range(len(self.req_states)):
+                     if tokens_left <= self.req_states[i][0]:
+                         break
+                 self.req_states.insert(i, (tokens_left, tokens_occupied))

          if self.req_states is None:
              self.req_states = []
@@ -406,13 +425,11 @@ class PrefillAdder:
          cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
          tokens_freed = 0
          for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-             decode_steps = (
-                 self.req_states[i + 1][0]
-                 if i + 1 < len(self.req_states)
-                 else tokens_left
-             )
+             # tokens_left gives a reservative calculation as the last token is not stored
              bs = len(self.req_states) - i
-             if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+             min_free_tokens = cur_rem_tokens + tokens_freed - tokens_left * bs
+             # reserve tokens for corner cases
+             if min_free_tokens <= IGNORE_EOS_RESERVE_TOKENS * bs:
                  return AddReqResult.NO_TOKEN
              tokens_freed += tokens_occupied

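For context: the rewritten check uses each running request's own tokens_left as the number of decode steps it must survive before its memory is freed, and additionally reserves IGNORE_EOS_RESERVE_TOKENS per still-running request. A worked sketch with invented numbers, showing the loop admitting a request (not part of the diff):

    IGNORE_EOS_RESERVE_TOKENS = 1
    # (tokens_left, tokens_occupied) for running ignore_eos requests, sorted by tokens_left
    req_states = [(10, 100), (20, 150), (30, 200)]
    cur_rem_tokens = 50   # free tokens after subtracting the new request's input
    tokens_freed = 0
    for i, (tokens_left, tokens_occupied) in enumerate(req_states):
        bs = len(req_states) - i
        min_free_tokens = cur_rem_tokens + tokens_freed - tokens_left * bs
        # i=0: 50 + 0   - 10*3 = 20  > 1*3 -> fits
        # i=1: 50 + 100 - 20*2 = 110 > 1*2 -> fits
        # i=2: 50 + 250 - 30*1 = 270 > 1*1 -> fits
        assert min_free_tokens > IGNORE_EOS_RESERVE_TOKENS * bs  # otherwise NO_TOKEN
        tokens_freed += tokens_occupied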
@@ -422,7 +439,7 @@ class PrefillAdder:
          ):
              # Non-chunked prefill
              self.can_run_list.append(req)
-             self._prefill_one_req(
+             self._update_prefill_budget(
                  0,
                  req.extend_input_len,
                  min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION),
@@ -438,55 +455,52 @@ class PrefillAdder:
              req.fill_ids = req.fill_ids[:trunc_len]
              self.can_run_list.append(req)
              self.new_chunked_req = req
-             self._prefill_one_req(0, trunc_len, 0)
+             self._update_prefill_budget(0, trunc_len, 0)

          return self.budget_state()

-     def add_one_req(
-         self, req: Req, has_chunked_req: bool, enable_hierarchical_cache: bool = False
-     ):
+     def add_one_req(self, req: Req, has_chunked_req: bool):
          if req.sampling_params.ignore_eos and getattr(self.tree_cache, "disable", True):
              return self.add_one_req_ignore_eos(req, has_chunked_req)

          total_tokens = req.extend_input_len + min(
              req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION
          )
-         input_tokens = (
-             -(-req.extend_input_len // self.tree_cache.page_size)
-             * self.tree_cache.page_size
-         )
+
+         # adjusting the input_tokens based on host_hit_length and page_size
+         real_input_tokens = req.extend_input_len - req.host_hit_length
+         real_input_tokens = self.ceil_paged_tokens(real_input_tokens)
          prefix_len = len(req.prefix_indices)

          if total_tokens >= self.rem_total_tokens:
              return AddReqResult.NO_TOKEN

-         if input_tokens > self.rem_input_tokens and len(self.can_run_list) != 0:
+         if real_input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
              return AddReqResult.OTHER

          with self._lock_node(req.last_node):
-             if total_tokens > self.rem_total_tokens:
+             # self.rem_total_tokens may decrease after the lock acquisition
+             if total_tokens >= self.rem_total_tokens:
                  return AddReqResult.NO_TOKEN

-             if (
-                 enable_hierarchical_cache
-                 and req.last_node_global is not None
-                 and req.last_node_global.evicted
-             ):
-                 req.last_node, req.prefix_indices = self.tree_cache.init_load_back(
-                     req.last_node_global, req.prefix_indices
+             if req.host_hit_length > 0:
+                 new_indices, req.last_node = self.tree_cache.init_load_back(
+                     req.last_host_node, req.host_hit_length
                  )
+                 req.prefix_indices = torch.cat([req.prefix_indices, new_indices])
                  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-                 input_tokens = (
-                     -(-req.extend_input_len // self.tree_cache.page_size)
-                     * self.tree_cache.page_size
-                 )
                  prefix_len = len(req.prefix_indices)

+             input_tokens = self.ceil_paged_tokens(req.extend_input_len)
+
+             if input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
+                 return AddReqResult.OTHER
+
              if self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
                  # Non-chunked prefill
                  self.can_run_list.append(req)
                  self.tree_cache.inc_lock_ref(req.last_node)
-                 self._prefill_one_req(
+                 self._update_prefill_budget(
                      prefix_len,
                      input_tokens,
                      min(
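For context: when host_hit_length > 0, the missing part of the prefix is loaded back from host memory and its new device indices are concatenated onto the indices already resident on the device. A hedged sketch of just that concatenation step, with the index tensors invented for illustration (not part of the diff):

    import torch

    device_prefix = torch.arange(0, 4)     # KV indices already on the device
    loaded_back = torch.arange(100, 105)   # indices returned by the load-back step
    prefix_indices = torch.cat([device_prefix, loaded_back])

    fill_ids_len = 16                      # total prompt length for the request
    extend_input_len = fill_ids_len - len(prefix_indices)   # 16 - 9 = 7 tokens left to prefill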
@@ -496,7 +510,7 @@ class PrefillAdder:
                  )
              else:
                  # Make sure at least one page is available
-                 trunc_len = self.rem_chunk_tokens - self.tree_cache.page_size + 1
+                 trunc_len = self.rem_chunk_tokens - self.page_size + 1
                  if trunc_len <= 0:
                      return AddReqResult.OTHER

@@ -507,6 +521,6 @@ class PrefillAdder:
                  self.can_run_list.append(req)
                  self.new_chunked_req = req
                  self.tree_cache.inc_lock_ref(req.last_node)
-                 self._prefill_one_req(prefix_len, trunc_len, 0)
+                 self._update_prefill_budget(prefix_len, trunc_len, 0)

          return self.budget_state()