sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +1 -0
- sglang/bench_serving.py +9 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +4 -1
- sglang/srt/layers/attention/flashinfer_backend.py +51 -5
- sglang/srt/layers/attention/triton_backend.py +16 -25
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
- sglang/srt/layers/moe/topk.py +191 -0
- sglang/srt/layers/quantization/__init__.py +5 -50
- sglang/srt/layers/quantization/fp8.py +221 -36
- sglang/srt/layers/quantization/fp8_kernel.py +278 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/radix_attention.py +8 -1
- sglang/srt/layers/sampler.py +27 -5
- sglang/srt/layers/torchao_utils.py +31 -0
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +54 -34
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +171 -136
- sglang/srt/managers/tokenizer_manager.py +184 -133
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +25 -11
- sglang/srt/model_executor/model_runner.py +28 -14
- sglang/srt/model_parallel.py +66 -5
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +73 -9
- sglang/srt/models/llama.py +22 -0
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +8 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/server.py +2 -1
- sglang/srt/server_args.py +19 -9
- sglang/srt/utils.py +40 -54
- sglang/test/test_block_fp8.py +341 -0
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py

```diff
@@ -129,6 +129,7 @@ class ImageInputs:
     image_hashes: Optional[list] = None
     image_sizes: Optional[list] = None
     image_offsets: Optional[list] = None
+    image_pad_len: Optional[list] = None
     pad_values: Optional[list] = None
     modalities: Optional[list] = None
     num_image_tokens: Optional[int] = None
```
```diff
@@ -181,6 +182,7 @@ class ImageInputs:
         optional_args = [
             "image_sizes",
             "image_offsets",
+            "image_pad_len",
             # "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
             "aspect_ratio_ids",
             "aspect_ratio_mask",
```
```diff
@@ -200,6 +202,9 @@ class Req:
         origin_input_text: str,
         origin_input_ids: Tuple[int],
         sampling_params: SamplingParams,
+        return_logprob: bool = False,
+        top_logprobs_num: int = 0,
+        stream: bool = False,
         origin_input_ids_unpadded: Optional[Tuple[int]] = None,
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
```
```diff
@@ -217,10 +222,11 @@ class Req:
         self.output_ids = []  # Each decode stage's output ids
         self.fill_ids = None  # fill_ids = origin_input_ids + output_ids
         self.session_id = session_id
+        self.input_embeds = input_embeds

+        # Sampling info
         self.sampling_params = sampling_params
         self.lora_path = lora_path
-        self.input_embeds = input_embeds

         # Memory pool info
         self.req_pool_idx = None
```
```diff
@@ -228,8 +234,8 @@ class Req:
         # Check finish
         self.tokenizer = None
         self.finished_reason = None
-        self.stream = False
         self.to_abort = False
+        self.stream = stream

         # For incremental decoding
         # ----- | --------- read_ids -------|
```
```diff
@@ -241,37 +247,46 @@ class Req:
         # 2: read_offset
         # 3: last token
         self.vid = 0  # version id to sync decode status with in detokenizer_manager
-        self.decoded_text = ""
         self.surr_offset = None  # Surrounding offset to defeat the cleanup algorithm
         self.read_offset = None
-
-        # The number of decoded tokens for token usage report. Note that
-        # this does not include the jump forward tokens.
-        self.completion_tokens_wo_jump_forward = 0
+        self.decoded_text = ""

         # For multimodal inputs
         self.image_inputs: Optional[ImageInputs] = None

         # Prefix info
         self.prefix_indices = []
+        # Tokens to run prefill. input_tokens - shared_prefix_tokens.
         self.extend_input_len = 0
         self.last_node = None
+
+        # Chunked prefill
         self.is_being_chunked = 0

         # For retraction
         self.is_retracted = False

         # Logprobs (arguments)
-        self.return_logprob =
+        self.return_logprob = return_logprob
         self.logprob_start_len = 0
-        self.top_logprobs_num =
+        self.top_logprobs_num = top_logprobs_num

         # Logprobs (return value)
         self.normalized_prompt_logprob = None
-        self.
-        self.
-        self.
-        self.
+        self.input_token_logprobs_val = None
+        self.input_token_logprobs_idx = None
+        self.input_top_logprobs_val = None
+        self.input_top_logprobs_idx = None
+
+        if return_logprob:
+            self.output_token_logprobs_val = []
+            self.output_token_logprobs_idx = []
+            self.output_top_logprobs_val = []
+            self.output_top_logprobs_idx = []
+        else:
+            self.output_token_logprobs_val = self.output_token_logprobs_idx = (
+                self.output_top_logprobs_val
+            ) = self.output_top_logprobs_idx = None

         # Logprobs (internal values)
         # The tokens is prefilled but need to be considered as decode tokens
```
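The hunk above changes how a `Req` tracks logprobs: values and token ids now live in parallel `*_val` / `*_idx` lists, and the output-side lists are allocated only when `return_logprob` is requested in the constructor. Below is a minimal, self-contained sketch of that storage pattern; `ReqLogprobsSketch` and `append_decode_step` are illustrative names, not the actual sglang `Req` API.

```python
from typing import List, Optional


class ReqLogprobsSketch:
    """Illustrative sketch of the val/idx logprob storage shown in the diff above."""

    def __init__(self, return_logprob: bool = False, top_logprobs_num: int = 0):
        self.return_logprob = return_logprob
        self.top_logprobs_num = top_logprobs_num

        # Output-side lists exist only when logprobs were requested;
        # otherwise they stay None and no per-token bookkeeping is done.
        if return_logprob:
            self.output_token_logprobs_val: Optional[List[float]] = []
            self.output_token_logprobs_idx: Optional[List[int]] = []
        else:
            self.output_token_logprobs_val = self.output_token_logprobs_idx = None

    def append_decode_step(self, token_id: int, logprob: float):
        if self.return_logprob:
            # Parallel lists: position i of *_val and *_idx describe the same token.
            self.output_token_logprobs_val.append(logprob)
            self.output_token_logprobs_idx.append(token_id)


req = ReqLogprobsSketch(return_logprob=True)
req.append_decode_step(token_id=42, logprob=-0.7)
print(req.output_token_logprobs_val, req.output_token_logprobs_idx)  # [-0.7] [42]
```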
```diff
@@ -295,13 +310,14 @@ class Req:
         else:
             self.image_inputs.merge(image_inputs)

-    # whether request reached finished condition
     def finished(self) -> bool:
+        # Whether request reached finished condition
        return self.finished_reason is not None

     def init_next_round_input(self, tree_cache: Optional[BasePrefixCache] = None):
         self.fill_ids = self.origin_input_ids + self.output_ids
         if tree_cache is not None:
+            # tree cache is None if the prefix is not computed with tree cache.
             self.prefix_indices, self.last_node = tree_cache.match_prefix(
                 rid=self.rid, key=self.adjust_max_prefix_ids()
             )
```
```diff
@@ -454,15 +470,31 @@ class Req:
                 k = k + 1
             else:
                 break
-        self.
-        self.
+        self.output_token_logprobs_val = self.output_token_logprobs_val[:k]
+        self.output_token_logprobs_idx = self.output_token_logprobs_idx[:k]
+        self.output_top_logprobs_val = self.output_top_logprobs_val[:k]
+        self.output_top_logprobs_idx = self.output_top_logprobs_idx[:k]
         self.logprob_start_len = prompt_tokens + k
         self.last_update_decode_tokens = len(self.output_ids) - k

         return True

+    def reset_for_retract(self):
+        self.prefix_indices = []
+        self.last_node = None
+        self.extend_input_len = 0
+        self.is_retracted = True
+
+        # For incremental logprobs
+        # TODO: Fix the `logprob_start_len`
+        self.last_update_decode_tokens = 0
+        self.logprob_start_len = 10**9
+
     def __repr__(self):
-        return
+        return (
+            f"rid(n={self.rid}, "
+            f"input_ids={self.origin_input_ids}, output_ids={self.output_ids}"
+        )


 bid = 0
```
```diff
@@ -470,7 +502,7 @@ bid = 0

 @dataclasses.dataclass
 class ScheduleBatch:
-    """Store all
+    """Store all information of a batch on the scheduler."""

     # Request, memory pool, and cache
     reqs: List[Req]
```
```diff
@@ -876,15 +908,7 @@ class ScheduleBatch:
                 )
                 residual_size = max(0, residual_size)
                 self.tree_cache.evict(residual_size, self.token_to_kv_pool.free)
-
-            req.prefix_indices = []
-            req.last_node = None
-            req.extend_input_len = 0
-            req.is_retracted = True
-
-            # For incremental logprobs
-            req.last_update_decode_tokens = 0
-            req.logprob_start_len = 10**9
+            req.reset_for_retract()

         self.filter_batch(keep_indices=sorted_indices)

```
```diff
@@ -1068,9 +1092,9 @@ class ScheduleBatch:
         self.top_logprobs_nums = [0] * len(self.reqs) + other.top_logprobs_nums
         self.reqs.extend(other.reqs)

-        self.return_logprob
-        self.has_stream
-        self.has_grammar
+        self.return_logprob |= other.return_logprob
+        self.has_stream |= other.has_stream
+        self.has_grammar |= other.has_grammar

     def get_model_worker_batch(self):
         if self.forward_mode.is_decode() or self.forward_mode.is_idle():
```
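For plain Python bools, the in-place `|=` used above behaves like `flag = flag or other_flag` and keeps the result a bool, so a merged batch needs logprob, streaming, or grammar handling if either side did. A one-line sketch of that semantics:

```python
# In-place OR on bools: the merged flag is True if either input flag was True.
return_logprob, other_return_logprob = False, True
return_logprob |= other_return_logprob
assert return_logprob is True
```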
```diff
@@ -1097,7 +1121,6 @@ class ScheduleBatch:
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
             seq_lens_sum=self.seq_lens_sum,
-            req_to_token_pool_records=self.req_to_token_pool.get_write_records(),
             return_logprob=self.return_logprob,
             top_logprobs_nums=self.top_logprobs_nums,
             global_num_tokens=self.global_num_tokens,
```
```diff
@@ -1152,9 +1175,6 @@ class ModelWorkerBatch:
     # The sum of all sequence lengths
     seq_lens_sum: int

-    # The memory pool operation records
-    req_to_token_pool_records: Optional[List[Tuple[Tuple, torch.Tensor]]]
-
     # For logprob
     return_logprob: bool
     top_logprobs_nums: Optional[List[int]]
```
sglang/srt/managers/schedule_policy.py

```diff
@@ -20,9 +20,11 @@ from contextlib import contextmanager
 from enum import Enum, auto
 from typing import Dict, List, Optional

+import torch
+
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
-from sglang.srt.mem_cache.radix_cache import TreeNode
+from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode

 # Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
 # This can prevent the server from being too conservative.
```
```diff
@@ -32,6 +34,21 @@ CLIP_MAX_NEW_TOKENS_ESTIMATION = int(
     os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096")
 )

+# Threshold for in-batch prefix cache.
+# If a request has a matched prefix length (against existing cache) less than this value,
+# the scheduler runs the in-batch prefix caching check for this request.
+# If we set it to -1, it means we disable in-batch prefix caching.
+IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD = int(
+    os.environ.get("IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD", "32")
+)
+
+# Threshold for in-batch prefix cache.
+# If a request has a matched prefix length (within the waiting queue) larger than this value,
+# the scheduler deprioritizes this request
+IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD = int(
+    os.environ.get("IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD", "32")
+)
+

 class SchedulePolicy:
     def __init__(self, policy: str, tree_cache: BasePrefixCache):
```
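Both thresholds are read from environment variables once, at module level, so they have to be in the environment before `sglang.srt.managers.schedule_policy` is imported (e.g., before launching the server process). A minimal sketch, using the variable names from the hunk above:

```python
import os

# Must be set before the schedule_policy module is imported,
# because the thresholds are read once at import time.
os.environ["IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD"] = "64"
os.environ["IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD"] = "16"

# Per the comment in the diff, setting the check threshold to -1
# disables in-batch prefix caching entirely:
# os.environ["IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD"] = "-1"
```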
```diff
@@ -42,6 +59,11 @@ class SchedulePolicy:
         self.policy = policy
         self.tree_cache = tree_cache

+        # It is used to find the matching prefix for in-batch prefix caching.
+        self.waiting_queue_radix_tree = RadixCache(
+            req_to_token_pool=None, token_to_kv_pool=None, disable=False
+        )
+
     def calc_priority(self, waiting_queue: List[Req]):
         if len(waiting_queue) > 128 and self.policy == "lpm":
             # Turn off the expensive prefix matching and sorting when the #queue is large.
```
```diff
@@ -52,17 +74,53 @@ class SchedulePolicy:
         # Compute matched prefix length
         prefix_computed = False
         if policy == "lpm" or policy == "dfs-weight":
+            # rid to deprioritize in the current run for in-batch prefix caching.
+            temporary_deprioritized = set()
+            self.waiting_queue_radix_tree.reset()
+
             for r in waiting_queue:
+                prefix_ids = r.adjust_max_prefix_ids()
+
                 # NOTE: the prefix_indices must always be aligned with last_node
                 r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
-                    rid=r.rid, key=
+                    rid=r.rid, key=prefix_ids
                 )

+                # NOTE(sang): This logic is for in-batch prefix caching;
+                # If there are more than 1 request that have small matching prefix from
+                # existing cache, but all those requests share the same prefix, we prefer
+                # to schedule only one of them so that we can increase the cache hit rate.
+                # We prefer to set IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD > 0 because too small
+                # threshold means we cannot use in-batch prefix caching for short prefixes.
+                # It is kind of common when the engine is long running (e.g., imagine the prefix "the").
+                if len(r.prefix_indices) <= IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD:
+                    in_batch_matching_prefixes, _ = (
+                        self.waiting_queue_radix_tree.match_prefix(
+                            rid=r.rid, key=prefix_ids
+                        )
+                    )
+                    if (
+                        len(in_batch_matching_prefixes)
+                        >= IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD
+                    ):
+                        temporary_deprioritized.add(r.rid)
+                    else:
+                        # Insert with a dummy key
+                        self.waiting_queue_radix_tree.insert(
+                            prefix_ids, torch.empty(len(prefix_ids), dtype=torch.bool)
+                        )
+
             prefix_computed = True

         if policy == "lpm":
             # Longest Prefix Match
-            waiting_queue.sort(
+            waiting_queue.sort(
+                key=lambda r: (
+                    -len(r.prefix_indices)
+                    if r.rid not in temporary_deprioritized
+                    else float("inf")
+                )
+            )
         elif policy == "fcfs":
             # first come first serve
             pass
```
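The net effect of the hunk above is on the longest-prefix-match ordering: a request whose prefix is already well represented in the waiting queue's own radix tree gets a sort key of `float("inf")` and falls to the back, while everything else sorts by descending matched prefix length. A tiny standalone sketch of just that sort key, using a hypothetical `DummyReq` instead of the real sglang `Req` and `RadixCache`:

```python
# Illustration of the LPM sort key with deprioritization (dummy objects only).
from dataclasses import dataclass, field
from typing import List


@dataclass
class DummyReq:
    rid: str
    prefix_indices: List[int] = field(default_factory=list)


waiting_queue = [
    DummyReq("a", prefix_indices=[0] * 5),
    DummyReq("b", prefix_indices=[0] * 40),
    DummyReq("c", prefix_indices=[0] * 20),
]
# Pretend "b"'s prefix was already seen often enough within this batch.
temporary_deprioritized = {"b"}

waiting_queue.sort(
    key=lambda r: (
        -len(r.prefix_indices)
        if r.rid not in temporary_deprioritized
        else float("inf")
    )
)
print([r.rid for r in waiting_queue])  # ['c', 'a', 'b']: longest prefix first, deprioritized last
```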
```diff
@@ -72,6 +130,7 @@ class SchedulePolicy:
         elif policy == "random":
             random.shuffle(waiting_queue)
         elif policy == "dfs-weight":
+            # Experimental policy based on custom weights
             last_node_to_reqs = defaultdict(list)
             for req in waiting_queue:
                 last_node_to_reqs[req.last_node].append(req)
```
```diff
@@ -101,8 +160,8 @@ class SchedulePolicy:
     def get_dfs_priority(
         self,
         cur_node: TreeNode,
-        node_to_priority: Dict,
-        last_node_to_reqs: Dict,
+        node_to_priority: Dict[TreeNode, int],
+        last_node_to_reqs: Dict[TreeNode, List[Req]],
         q: List,
     ):
         childs = [child for child in cur_node.children.values()]
```