sglang 0.4.6.post4__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +23 -15
- sglang/bench_serving.py +133 -57
- sglang/compile_deep_gemm.py +4 -4
- sglang/srt/configs/model_config.py +39 -28
- sglang/srt/conversation.py +1 -1
- sglang/srt/disaggregation/decode.py +122 -133
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +11 -2
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +9 -19
- sglang/srt/disaggregation/prefill.py +126 -44
- sglang/srt/disaggregation/utils.py +116 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +28 -8
- sglang/srt/entrypoints/http_server.py +6 -4
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +63 -17
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +0 -10
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +104 -50
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +66 -9
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +7 -2
- sglang/srt/layers/quantization/deep_gemm.py +5 -3
- sglang/srt/layers/quantization/fp8.py +90 -0
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +16 -3
- sglang/srt/managers/mm_utils.py +293 -139
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +49 -21
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +92 -50
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +99 -24
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +2 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +20 -9
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +4 -0
- sglang/srt/model_executor/model_runner.py +144 -54
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_v2.py +297 -343
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/openai_api/adapter.py +28 -16
- sglang/srt/openai_api/protocol.py +6 -0
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/server_args.py +134 -24
- sglang/srt/speculative/eagle_utils.py +131 -0
- sglang/srt/speculative/eagle_worker.py +47 -2
- sglang/srt/utils.py +68 -12
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_utils.py +2 -36
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +20 -11
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +128 -102
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -46,7 +46,6 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
-    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -59,6 +58,7 @@
     chat_template: Optional[str] = None
     completion_template: Optional[str] = None
     is_embedding: bool = False
+    enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None

     # Port for the HTTP server
@@ -97,8 +97,13 @@
     log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
+    bucket_time_to_first_token: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
+    bucket_inter_token_latency: Optional[List[float]] = None
+    collect_tokens_histogram: bool = False
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
+    kv_events_config: Optional[str] = None

     # API related
     api_key: Optional[str] = None
@@ -120,6 +125,7 @@

     # Model override args in JSON
     json_model_override_args: str = "{}"
+    preferred_sampling_params: Optional[str] = None

     # LoRA
     lora_paths: Optional[List[str]] = None
@@ -154,9 +160,9 @@
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
+    enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    enable_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
@@ -164,6 +170,17 @@
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_rebalance_num_iterations: int = 1000
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
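For orientation, the relocated and newly added fields above are plain dataclass attributes, so they can also be set when constructing ServerArgs directly in Python. A minimal sketch, assuming sglang is installed and that model_path remains the only required field as in previous releases (the model name is illustrative):

from sglang.srt.server_args import ServerArgs

# Illustrative values only; the field names come from the hunks above.
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # assumed example model
    enable_multimodal=True,               # moved next to is_embedding in this release
    enable_tokenizer_batch_encode=False,  # moved into the optimization section
    ep_num_redundant_experts=0,           # new expert-parallel fields
    enable_eplb=False,
    expert_distribution_recorder_mode=None,
)
print(args.init_expert_location)  # "trivial" by default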
@@ -229,7 +246,7 @@
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             parallel_size = self.tp_size * self.pp_size
-            if gpu_mem <= 81920:
+            if gpu_mem is not None and gpu_mem <= 81920:
                 if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
                 elif parallel_size >= 8:
@@ -242,7 +259,7 @@
                     self.mem_fraction_static = 0.88
             else:
                 self.mem_fraction_static = 0.88
-            if gpu_mem > 96 * 1024:
+            if gpu_mem is not None and gpu_mem > 96 * 1024:
                 mem_fraction = self.mem_fraction_static
                 self.mem_fraction_static = min(
                     mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
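Both guards now tolerate a missing gpu_mem reading. For the large-GPU branch, a small worked example of the adjustment formula shown in the hunk (numbers are illustrative; the cap passed as min's second argument falls outside this hunk and 0.97 is assumed here):

# Illustrative only: a ~141 GB GPU with the default mem_fraction_static of 0.88.
gpu_mem = 141 * 1024  # MiB, compared against the 96 * 1024 threshold above
mem_fraction = 0.88
adjusted = min(mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem, 0.97)
print(round(adjusted, 4))  # ~0.9209: the static fraction is raised on larger GPUs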
@@ -307,12 +324,6 @@
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-        if self.pp_size > 1:
-            self.disable_overlap_schedule = True
-            logger.warning(
-                "Overlap scheduler is disabled because of using pipeline parallelism."
-            )
-
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -354,6 +365,15 @@
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

+        if self.expert_distribution_recorder_buffer_size is None:
+            # TODO pr-chain: enable this later
+            # if (x := self.eplb_rebalance_num_iterations) is not None:
+            #     self.expert_distribution_recorder_buffer_size = x
+            if False:
+                pass
+            elif self.expert_distribution_recorder_mode is not None:
+                self.expert_distribution_recorder_buffer_size = 1000
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -474,11 +494,6 @@
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
-        parser.add_argument(
-            "--enable-tokenizer-batch-encode",
-            action="store_true",
-            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
-        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -556,6 +571,7 @@
                 "w8a8_int8",
                 "w8a8_fp8",
                 "moe_wna16",
+                "qoq",
             ],
             help="The quantization method.",
         )
@@ -603,6 +619,12 @@
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
         parser.add_argument(
             "--revision",
             type=str,
@@ -780,6 +802,39 @@
             action="store_true",
             help="Enable log prometheus metrics.",
         )
+        parser.add_argument(
+            "--bucket-time-to-first-token",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_time_to_first_token,
+            help="The buckets of time to first token, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-inter-token-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_inter_token_latency,
+            help="The buckets of inter-token latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-e2e-request-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_e2e_request_latency,
+            help="The buckets of end-to-end request latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--collect-tokens-histogram",
+            action="store_true",
+            default=ServerArgs.collect_tokens_histogram,
+            help="Collect prompt/generation tokens histogram.",
+        )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
@@ -868,6 +923,11 @@
             help="A dictionary in JSON string format used to override default model configurations.",
             default=ServerArgs.json_model_override_args,
         )
+        parser.add_argument(
+            "--preferred-sampling-params",
+            type=str,
+            help="json-formatted sampling settings that will be returned in /get_model_info",
+        )

         # LoRA
         parser.add_argument(
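A minimal sketch of exercising the new metrics flags end to end. It assumes the existing ServerArgs.add_cli_args and ServerArgs.from_cli_args helpers, which are not changed by this diff, and an illustrative model path:

import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",
        "--enable-metrics",
        "--bucket-time-to-first-token", "0.1", "0.5", "1.0", "2.0",
        "--collect-tokens-histogram",
    ]
)
server_args = ServerArgs.from_cli_args(args)
print(server_args.bucket_time_to_first_token)  # [0.1, 0.5, 1.0, 2.0]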
@@ -896,6 +956,7 @@
             "--attention-backend",
             type=str,
             choices=[
+                "aiter",
                 "flashinfer",
                 "triton",
                 "torch_native",
@@ -1043,6 +1104,11 @@
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
@@ -1053,12 +1119,6 @@
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
-        parser.add_argument(
-            "--enable-multimodal",
-            default=ServerArgs.enable_multimodal,
-            action="store_true",
-            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
-        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -1072,7 +1132,7 @@
         parser.add_argument(
             "--enable-dp-attention",
             action="store_true",
-            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 is supported.",
+            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported.",
         )
         parser.add_argument(
             "--enable-dp-lm-head",
@@ -1212,6 +1272,58 @@
             default="auto",
             help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
         )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster.",
+        )

         parser.add_argument(
             "--n-share-experts-fusion",
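For a sense of scale, --ep-num-redundant-experts adds extra physical expert slots on top of the model's logical experts. The sizing below is an assumed illustration only, not something specified by this diff:

# Hypothetical sizing: a DeepSeek-V3-style MoE with 256 routed experts,
# expert parallelism over 8 ranks, and 32 redundant replicas requested.
num_logical_experts = 256
ep_size = 8
ep_num_redundant_experts = 32

num_physical_experts = num_logical_experts + ep_num_redundant_experts
print(num_physical_experts // ep_size)  # 36 physical expert slots per rank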
@@ -1326,8 +1438,6 @@

         # FIXME pp constraints
         if self.pp_size > 1:
-            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
-            self.disable_overlap_schedule = True
             assert (
                 self.disable_overlap_schedule
                 and self.speculative_algorithm is None
sglang/srt/speculative/eagle_utils.py
CHANGED
@@ -9,15 +9,18 @@ import torch.nn.functional as F
 import triton
 import triton.language as tl

+from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import (
+    Req,
     ScheduleBatch,
     get_last_loc,
     global_server_args_dict,
 )
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
 from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2
@@ -187,6 +190,7 @@ class EagleVerifyInput:
     draft_token_num: int
     spec_steps: int
     capture_hidden_mode: CaptureHiddenMode
+    grammar: BaseGrammarObject = None

     @classmethod
     def create(
@@ -307,6 +311,7 @@
         logits_output: torch.Tensor,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
         page_size: int,
+        vocab_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Verify and find accepted tokens based on logits output and batch
@@ -343,6 +348,13 @@
                 torch.repeat_interleave(linear_penalty, self.draft_token_num, dim=0)
             )

+        # Apply grammar mask
+        if vocab_mask is not None:
+            assert self.grammar is not None
+            self.grammar.apply_vocab_mask(
+                logits=logits_output.next_token_logits, vocab_mask=vocab_mask
+            )
+
         # Sample tokens
         if batch.sampling_info.is_all_greedy:
             target_predict = torch.argmax(logits_output.next_token_logits, dim=-1)
@@ -440,6 +452,15 @@
                     break
                 else:
                     new_accept_index_.append(idx)
+                    # update grammar state
+                    if req.grammar is not None:
+                        try:
+                            req.grammar.accept_token(id)
+                        except ValueError as e:
+                            logger.info(
+                                f"{i=}, {req=}\n" f"{accept_index=}\n" f"{predict=}\n"
+                            )
+                            raise e
             if not req.finished():
                 new_accept_index.extend(new_accept_index_)
                 unfinished_index.append(i)
@@ -801,3 +822,113 @@ def _generate_simulated_accept_index(
     accept_length.fill_(simulate_acc_len - 1)
     predict.fill_(100)  # some legit token id
     return sim_accept_index
+
+
+def traverse_tree(
+    retrieve_next_token: torch.Tensor,
+    retrieve_next_sibling: torch.Tensor,
+    draft_tokens: torch.Tensor,
+    grammar: BaseGrammarObject,
+    allocate_token_bitmask: torch.Tensor,
+):
+    """
+    Traverse the tree constructed by the draft model to generate the logits mask.
+    """
+    assert (
+        retrieve_next_token.shape == retrieve_next_sibling.shape == draft_tokens.shape
+    )
+
+    allocate_token_bitmask.fill_(0)
+
+    def dfs(
+        curr: int,
+        retrieve_next_token: torch.Tensor,
+        retrieve_next_sibling: torch.Tensor,
+        parent_pos: int,
+    ):
+        if curr == 0:
+            # the first token generated by the target model, and thus it is always
+            # accepted from the previous iteration
+            accepted = True
+        else:
+            parent_bitmask = allocate_token_bitmask[parent_pos]
+            curr_token_id = draft_tokens[curr]
+            # 32 boolean bitmask values are packed into 32-bit integers
+            accepted = (
+                parent_bitmask[curr_token_id // 32] & (1 << (curr_token_id % 32))
+            ) != 0
+
+        if accepted:
+            if curr != 0:
+                # Accept the current token
+                grammar.accept_token(draft_tokens[curr])
+            if not grammar.is_terminated():
+                # Generate the bitmask for the current token
+                grammar.fill_vocab_mask(allocate_token_bitmask, curr)
+                if retrieve_next_token[curr] != -1:
+                    # Visit the child node
+                    dfs(
+                        retrieve_next_token[curr],
+                        retrieve_next_token,
+                        retrieve_next_sibling,
+                        curr,
+                    )
+
+            if curr != 0:
+                # Rollback the current token
+                grammar.rollback(1)
+
+        if retrieve_next_sibling[curr] != -1:
+            # Visit the sibling node
+            dfs(
+                retrieve_next_sibling[curr],
+                retrieve_next_token,
+                retrieve_next_sibling,
+                parent_pos,
+            )
+
+    dfs(0, retrieve_next_token, retrieve_next_sibling, -1)
+
+
+def generate_token_bitmask(
+    reqs: List[Req],
+    verify_input: EagleVerifyInput,
+    retrieve_next_token_cpu: torch.Tensor,
+    retrieve_next_sibling_cpu: torch.Tensor,
+    draft_tokens_cpu: torch.Tensor,
+    vocab_size: int,
+):
+    """
+    Generate the logit mask for structured output.
+    Draft model's token can be either valid or invalid with respect to the grammar.
+    We need to perform DFS to figure out:
+    1. which tokens are accepted by the grammar
+    2. what is the corresponding logit mask.
+    """
+
+    num_draft_tokens = draft_tokens_cpu.shape[-1]
+
+    allocate_token_bitmask = None
+    assert len(reqs) == retrieve_next_token_cpu.shape[0]
+    grammar = None
+    for i, req in enumerate(reqs):
+        if req.grammar is not None:
+            if allocate_token_bitmask is None:
+                allocate_token_bitmask = req.grammar.allocate_vocab_mask(
+                    vocab_size=vocab_size,
+                    batch_size=draft_tokens_cpu.numel(),
+                    device="cpu",
+                )
+            grammar = req.grammar
+            traverse_tree(
+                retrieve_next_token_cpu[i],
+                retrieve_next_sibling_cpu[i],
+                draft_tokens_cpu[i],
+                req.grammar,
+                allocate_token_bitmask[
+                    i * num_draft_tokens : (i + 1) * num_draft_tokens
+                ],
+            )
+
+    verify_input.grammar = grammar
+    return allocate_token_bitmask
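The acceptance test inside dfs above relies on a packed vocab-mask layout: 32 allow/deny bits per integer word, with a set bit meaning the token is allowed. A standalone sketch of that packing in plain Python, independent of the sglang classes:

vocab_size = 100
bitmask = [0] * ((vocab_size + 31) // 32)  # everything forbidden at first

def allow(token_id: int) -> None:
    bitmask[token_id // 32] |= 1 << (token_id % 32)

def is_allowed(token_id: int) -> bool:
    # The same test dfs() applies against the parent position's bitmask row.
    return (bitmask[token_id // 32] & (1 << (token_id % 32))) != 0

allow(42)
print(is_allowed(42), is_allowed(43))  # True False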
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -31,6 +31,7 @@ from sglang.srt.speculative.eagle_utils import (
     EagleVerifyInput,
     EagleVerifyOutput,
     assign_draft_cache_locs,
+    generate_token_bitmask,
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
@@ -199,6 +200,19 @@ class EAGLEWorker(TpModelWorker):
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = False
+        elif self.server_args.attention_backend == "flashmla":
+            from sglang.srt.layers.attention.flashmla_backend import (
+                FlashMLAMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = FlashMLAMultiStepDraftBackend(
+                self.draft_model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+            self.draft_extend_attn_backend = None
+            self.padded_static_len = self.speculative_num_steps + 1
+            self.has_prefill_wrapper_verify = False
         else:
             raise ValueError(
                 f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
@@ -215,7 +229,7 @@
             return

         # Capture draft
-        tic = time.time()
+        tic = time.perf_counter()
         before_mem = get_available_gpu_memory(self.device, self.gpu_id)
         logger.info(
             f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
@@ -223,7 +237,7 @@
         self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
         after_mem = get_available_gpu_memory(self.device, self.gpu_id)
         logger.info(
-            f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
+            f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
         )

         # Capture extend
@@ -479,11 +493,41 @@
         batch.forward_mode = ForwardMode.TARGET_VERIFY
         batch.spec_info = spec_info
         model_worker_batch = batch.get_model_worker_batch()
+
+        if batch.has_grammar:
+            retrieve_next_token_cpu = spec_info.retrive_next_token.cpu()
+            retrieve_next_sibling_cpu = spec_info.retrive_next_sibling.cpu()
+            draft_tokens_cpu = spec_info.draft_token.view(
+                spec_info.retrive_next_token.shape
+            ).cpu()
+
+        # Forward
         logits_output, _, can_run_cuda_graph = (
             self.target_worker.forward_batch_generation(
                 model_worker_batch, skip_sample=True
             )
         )
+
+        vocab_mask = None
+        if batch.has_grammar:
+            # Generate the logit mask for structured output.
+            # Overlap the CPU operations for bitmask generation with the forward pass.
+            vocab_mask = generate_token_bitmask(
+                batch.reqs,
+                spec_info,
+                retrieve_next_token_cpu,
+                retrieve_next_sibling_cpu,
+                draft_tokens_cpu,
+                batch.sampling_info.vocab_size,
+            )
+
+            if vocab_mask is not None:
+                assert spec_info.grammar is not None
+                vocab_mask = vocab_mask.to(spec_info.retrive_next_token.device)
+                # otherwise, this vocab mask will be the one from the previous extend stage
+                # and will be applied to produce wrong results
+                batch.sampling_info.vocab_mask = None
+
         self._detect_nan_if_needed(logits_output)
         spec_info.hidden_states = logits_output.hidden_states
         res: EagleVerifyOutput = spec_info.verify(
@@ -491,6 +535,7 @@
             logits_output,
             self.token_to_kv_pool_allocator,
             self.page_size,
+            vocab_mask,
         )

         # Post process based on verified outputs.
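To make the final wiring concrete: the mask built on the CPU while the target forward runs is moved to the device and handed to verify, which delegates to the grammar object's apply_vocab_mask. The helper below is only a hedged sketch of what applying a packed int32 mask to logits amounts to; it is not the sglang implementation:

import torch

def apply_packed_vocab_mask_sketch(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
    # Expand a [num_rows, ceil(vocab/32)] int32 bitmask and set disallowed
    # logits to -inf in place (illustration only).
    vocab_size = logits.shape[1]
    bit_idx = torch.arange(vocab_size, device=logits.device)
    words = vocab_mask[:, bit_idx // 32]          # pick the word holding each token's bit
    allowed = (words >> (bit_idx % 32)) & 1       # extract the per-token allow bit
    logits.masked_fill_(allowed == 0, float("-inf"))

logits = torch.randn(2, 64)
mask = torch.zeros(2, 2, dtype=torch.int32)
mask[:, 0] = (1 << 5) | (1 << 7)                  # allow only tokens 5 and 7
apply_packed_vocab_mask_sketch(logits, mask)
print(torch.isfinite(logits).sum(dim=-1))         # tensor([2, 2])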