sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +164 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +62 -23
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +26 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +4 -5
- sglang/srt/managers/data_parallel_controller.py +31 -9
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -31
- sglang/srt/managers/scheduler.py +325 -38
- sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +27 -8
- sglang/srt/mem_cache/memory_pool.py +258 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +85 -28
- sglang/srt/model_executor/forward_batch_info.py +81 -15
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +326 -192
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +145 -47
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +104 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -16,6 +16,7 @@
 import argparse
 import dataclasses
 import logging
+import os
 import random
 import tempfile
 from typing import List, Optional
@@ -24,12 +25,14 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
+    get_device,
     get_hpu_memory_capacity,
     get_nvgpu_memory_capacity,
     is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
+    is_remote_url,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -51,9 +54,10 @@ class ServerArgs:
     quantization: Optional[str] = None
     quantization_param_path: nullable_str = None
     context_length: Optional[int] = None
-    device: str = "cuda"
+    device: Optional[str] = None
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
+    completion_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
 
@@ -122,7 +126,7 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
-    grammar_backend: Optional[str] = "outlines"
+    grammar_backend: Optional[str] = "xgrammar"
 
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -154,6 +158,7 @@ class ServerArgs:
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
@@ -170,7 +175,9 @@ class ServerArgs:
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False
+    hicache_ratio: float = 2.0
     enable_flashinfer_mla: bool = False
+    enable_flashmla: bool = False
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
 
@@ -179,11 +186,18 @@ class ServerArgs:
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
 
+    # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
+    disaggregation_mode: str = "null"
+    disaggregation_bootstrap_port: int = 8998
+
     def __post_init__(self):
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
 
+        if self.device is None:
+            self.device = get_device()
+
         if self.served_model_name is None:
             self.served_model_name = self.model_path
 
@@ -222,6 +236,11 @@ class ServerArgs:
 
         assert self.chunked_prefill_size % self.page_size == 0
 
+        if self.enable_flashmla is True:
+            logger.warning(
+                "FlashMLA only supports a page_size of 64, change page_size to 64."
+            )
+            self.page_size = 64
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -262,25 +281,33 @@ class ServerArgs:
 
         # Data parallelism attention
         if self.enable_dp_attention:
-            self.dp_size = self.tp_size
-            assert self.tp_size % self.dp_size == 0
-            self.chunked_prefill_size = self.chunked_prefill_size // 2
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
+            assert (
+                self.dp_size > 1
+            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
+            assert self.tp_size % self.dp_size == 0
+            self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
-                f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
-                "Data parallel size is adjusted to be the same as tensor parallel size. "
             )
+        # DeepEP MoE
+        if self.enable_deepep_moe:
+            self.ep_size = self.dp_size
+            logger.info(
+                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the data parallel size[{self.dp_size}]."
+            )
 
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if self.speculative_algorithm == "EAGLE":
+        if (
+            self.speculative_algorithm == "EAGLE"
+            or self.speculative_algorithm == "EAGLE3"
+        ):
             if self.max_running_requests is None:
                 self.max_running_requests = 32
-            self.disable_cuda_graph_padding = True
             self.disable_overlap_schedule = True
             logger.info(
                 "Overlap scheduler is disabled because of using "
@@ -296,10 +323,29 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
+        if is_remote_url(self.model_path):
+            self.load_format = "remote"
+
         # AMD-specific Triton attention KV splits default number
         if is_hip():
             self.triton_attention_num_kv_splits = 16
 
+        # PD disaggregation
+        if self.disaggregation_mode == "prefill":
+            self.disable_cuda_graph = True
+            logger.warning("KV cache is forced as chunk cache for decode server")
+            self.disable_overlap_schedule = True
+            logger.warning("Overlap scheduler is disabled for prefill server")
+        elif self.disaggregation_mode == "decode":
+            self.disable_radix_cache = True
+            logger.warning("Cuda graph is disabled for prefill server")
+            self.disable_overlap_schedule = True
+            logger.warning("Overlap scheduler is disabled for decode server")
+
+        os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
+            "1" if self.enable_torch_compile else "0"
+        )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
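The __post_init__ hunks above tie several of the new options together: enable_flashmla forces a 64-token page size, and the PD-disaggregation modes turn off CUDA graphs, the radix cache, and the overlap scheduler on the corresponding side. A minimal sketch of how that surfaces when constructing ServerArgs directly, assuming sglang 0.4.4.post2 on a CUDA machine; the model path is only a placeholder:

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    enable_flashmla=True,                   # new flag; __post_init__ coerces page_size to 64
    disaggregation_mode="decode",           # new PD mode: "null", "prefill", or "decode"
)
assert args.page_size == 64                 # forced by the FlashMLA branch above
assert args.disable_radix_cache             # decode-side PD disaggregation
assert args.disable_overlap_schedule        # overlap scheduler disabled for the decode server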
@@ -345,9 +391,11 @@ class ServerArgs:
                 "safetensors",
                 "npcache",
                 "dummy",
+                "sharded_state",
                 "gguf",
                 "bitsandbytes",
                 "layered",
+                "remote",
             ],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
@@ -429,9 +477,8 @@ class ServerArgs:
         parser.add_argument(
             "--device",
             type=str,
-            default="cuda",
-            choices=["cuda", "xpu", "hpu", "cpu"],
-            help="The device type.",
+            default=ServerArgs.device,
+            help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
         )
         parser.add_argument(
             "--served-model-name",
@@ -445,6 +492,12 @@ class ServerArgs:
             default=ServerArgs.chat_template,
             help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
         )
+        parser.add_argument(
+            "--completion-template",
+            type=str,
+            default=ServerArgs.completion_template,
+            help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
+        )
         parser.add_argument(
             "--is-embedding",
             action="store_true",
@@ -722,7 +775,7 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton", "torch_native"],
+            choices=["flashinfer", "triton", "torch_native", "fa3"],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -745,6 +798,11 @@ class ServerArgs:
             action="store_true",
             help="Enable FlashInfer MLA optimization",
         )
+        parser.add_argument(
+            "--enable-flashmla",
+            action="store_true",
+            help="Enable FlashMLA decode optimization",
+        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
@@ -755,7 +813,7 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
@@ -984,6 +1042,18 @@ class ServerArgs:
             action="store_true",
             help="Enable hierarchical cache",
         )
+        parser.add_argument(
+            "--hicache-ratio",
+            type=float,
+            required=False,
+            default=ServerArgs.hicache_ratio,
+            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
 
         # Server warmups
         parser.add_argument(
@@ -1014,6 +1084,21 @@ class ServerArgs:
             help="Inject the outputs from jax as the input of every layer.",
         )
 
+        # Disaggregation
+        parser.add_argument(
+            "--disaggregation-mode",
+            type=str,
+            default="null",
+            choices=["null", "prefill", "decode"],
+            help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
+        )
+        parser.add_argument(
+            "--disaggregation-bootstrap-port",
+            type=int,
+            default=ServerArgs.disaggregation_bootstrap_port,
+            help="Bootstrap server port on the prefill server. Default is 8998.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
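The new CLI flags registered above reach the dataclass through the existing add_cli_args/from_cli_args pair. A short sketch of parsing them, assuming sglang 0.4.4.post2 is installed; the model path is a placeholder:

import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
raw = parser.parse_args(
    [
        "--model-path", "deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
        "--disaggregation-mode", "prefill",              # new PD-disaggregation flag
        "--disaggregation-bootstrap-port", "8998",       # new bootstrap port (default 8998)
        "--hicache-ratio", "2.0",                        # new host/device KV cache pool ratio
        "--enable-deepep-moe",                           # new DeepEP MoE toggle
    ]
)
server_args = ServerArgs.from_cli_args(raw)  # __post_init__ then applies the prefill-side defaults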
@@ -1088,6 +1173,9 @@ class PortArgs:
     # The port for nccl initialization (torch.dist)
     nccl_port: int
 
+    # The ipc filename for rpc call between Engine and Scheduler
+    rpc_ipc_name: str
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         port = server_args.port + random.randint(100, 1000)
@@ -1106,6 +1194,7 @@ class PortArgs:
                 scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 nccl_port=port,
+                rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -1131,6 +1220,7 @@ class PortArgs:
                 scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
                 detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
                 nccl_port=port,
+                rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
             )
 
 
sglang/srt/speculative/build_eagle_tree.py
CHANGED
@@ -3,8 +3,13 @@
 from typing import List
 
 import torch
-
-from
+
+from sglang.srt.utils import is_cuda_available, is_hip
+
+if is_cuda_available() or is_hip():
+    from sgl_kernel import (
+        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
+    )
 
 
 def build_tree_kernel_efficient_preprocess(
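The new header only pulls the fused kernel in when a CUDA or ROCm build of sgl_kernel is present. A minimal sketch of that guarded-import pattern; the None fallback is an assumption for illustration, not part of the diff:

from sglang.srt.utils import is_cuda_available, is_hip

if is_cuda_available() or is_hip():
    from sgl_kernel import (
        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
    )
else:
    # Assumed fallback so the module can still be imported on CPU-only installs.
    sgl_build_tree_kernel_efficient = None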
@@ -23,7 +28,6 @@ def build_tree_kernel_efficient_preprocess(
     top_scores = torch.topk(score_list, num_verify_tokens - 1, dim=-1)
     top_scores_index = top_scores.indices
     top_scores_index = torch.sort(top_scores_index).values
-
     draft_tokens = torch.gather(ss_token_list, index=top_scores_index, dim=1)
     draft_tokens = torch.cat((verified_id.unsqueeze(1), draft_tokens), dim=1).flatten()
 
@@ -108,296 +112,6 @@ def build_tree_kernel_efficient(
     )
 
 
-def build_tree_kernel(
-    verified_id: torch.Tensor,
-    score_list: List[torch.Tensor],
-    token_list: List[torch.Tensor],
-    parents_list: List[torch.Tensor],
-    seq_lens: torch.Tensor,
-    seq_lens_sum: int,
-    topk: int,
-    spec_steps: int,
-    num_verify_tokens: int,
-):
-    parent_list, top_scores_index, draft_tokens = (
-        build_tree_kernel_efficient_preprocess(
-            verified_id,
-            score_list,
-            token_list,
-            parents_list,
-            num_verify_tokens,
-        )
-    )
-
-    bs = seq_lens.numel()
-    device = seq_lens.device
-
-    tree_mask = torch.full(
-        (
-            seq_lens_sum * num_verify_tokens
-            + num_verify_tokens * num_verify_tokens * bs,
-        ),
-        True,
-        device=device,
-    )
-    retrive_index = torch.full(
-        (bs, num_verify_tokens, spec_steps + 2), -1, device=device, dtype=torch.long
-    )
-    positions = torch.empty((bs * num_verify_tokens,), device=device, dtype=torch.long)
-
-    sgl_build_tree_kernel(
-        parent_list,
-        top_scores_index,
-        seq_lens.to(torch.int32),
-        tree_mask,
-        positions,
-        retrive_index,
-        topk,
-        spec_steps,
-        num_verify_tokens,
-    )
-
-    index = retrive_index.sum(dim=-1) != -spec_steps - 2
-    cum_len = torch.cumsum(torch.sum(index, dim=-1), dim=-1)
-    retrive_cum_len = torch.zeros(
-        (cum_len.numel() + 1,), dtype=torch.int32, device="cuda"
-    )
-    retrive_cum_len[1:] = cum_len
-    # TODO: this indexing cause a synchronization, optimize this
-    retrive_index = retrive_index[index]
-    return tree_mask, positions, retrive_index, retrive_cum_len, draft_tokens
-
-
-def test_build_tree_kernel():
-    def findp(p_i, index, parent_list):
-        pos = index // 10
-        index_list = index.tolist()
-        parent_list = parent_list.tolist()
-        res = [p_i]
-        while True:
-            p = pos[p_i]
-            if p == 0:
-                break
-            token_idx = parent_list[p]
-            p_i = index_list.index(token_idx)
-            res.append(p_i)
-        return res
-
-    def create_mask(seq_len, draft_token, index, parent_list, max_depth):
-        mask = []
-        positions = []
-        retrive_index = []
-        for i, lens in enumerate(seq_len.tolist()):
-            first_mask = torch.full((lens + draft_token,), True)
-            first_mask[-(draft_token - 1) :] = False
-            positions.append(lens)
-            mask.append(first_mask)
-            seq_order = []
-            first_index = torch.Tensor([0] + [-1] * (depth + 1)).cuda().to(torch.long)
-            r_index = [first_index]
-            for j in range(draft_token - 1):
-                mask.append(torch.full((lens + 1,), True))
-                idx = findp(j, index, parent_list)
-
-                seq_order.append(idx)
-                positions.append(len(idx) + seq_len)
-                t = torch.full((draft_token - 1,), False)
-                t[idx] = True
-                mask.append(t)
-
-            for i in range(1, draft_token - 1):
-                is_leaf = 0
-                for j in range(draft_token - 1):
-                    if i in seq_order[j]:
-                        is_leaf += 1
-
-                if is_leaf == 1:
-                    order_list = [0] + [x + 1 for x in seq_order[i][::-1]]
-                    for _ in range(max_depth + 1 - len(seq_order[i])):
-                        order_list.append(-1)
-                    order = torch.Tensor(order_list).cuda().to(torch.long)
-                    r_index.append(order)
-            retrive_index.append(torch.stack(r_index))
-
-        return (
-            torch.cat(mask).cuda(),
-            torch.Tensor(positions).cuda().to(torch.long),
-            torch.stack(retrive_index),
-        )
-
-    index = (
-        torch.Tensor(
-            [
-                0,
-                1,
-                2,
-                3,
-                10,
-                11,
-                12,
-                13,
-                20,
-                21,
-                22,
-                30,
-                110,
-                130,
-                150,
-                160,
-                210,
-                211,
-                212,
-                213,
-                214,
-                215,
-                216,
-                217,
-                218,
-                219,
-                220,
-                230,
-                310,
-                311,
-                312,
-                313,
-                314,
-                315,
-                316,
-                317,
-                320,
-                321,
-                322,
-                330,
-                360,
-                380,
-                390,
-                410,
-                411,
-                412,
-                413,
-                414,
-                415,
-                416,
-                417,
-                418,
-                419,
-                420,
-                421,
-                422,
-                423,
-                430,
-                431,
-                440,
-                441,
-                460,
-                470,
-            ]
-        )
-        .to(torch.long)
-        .cuda()
-    )
-
-    parent_list = (
-        torch.Tensor(
-            [
-                -1,
-                0,
-                1,
-                2,
-                3,
-                4,
-                5,
-                6,
-                7,
-                8,
-                9,
-                10,
-                11,
-                12,
-                20,
-                30,
-                21,
-                13,
-                22,
-                40,
-                23,
-                110,
-                130,
-                160,
-                150,
-                190,
-                120,
-                111,
-                121,
-                200,
-                180,
-                210,
-                211,
-                212,
-                213,
-                214,
-                215,
-                216,
-                220,
-                230,
-                217,
-                310,
-                311,
-                312,
-                313,
-                320,
-                314,
-                321,
-                315,
-                316,
-                317,
-            ]
-        )
-        .to(torch.long)
-        .cuda()
-    )
-
-    verified_seq_len = torch.Tensor([47]).to(torch.long).cuda()
-    bs = verified_seq_len.shape[0]
-    topk = 10
-    depth = 5  # depth <= 10
-    num_draft_token = 64
-
-    tree_mask = torch.full(
-        (
-            torch.sum(verified_seq_len).item() * num_draft_token
-            + num_draft_token * num_draft_token * bs,
-        ),
-        True,
-    ).cuda()
-    retrive_index = torch.full(
-        (bs, num_draft_token, depth + 2), -1, device="cuda", dtype=torch.long
-    )
-    positions = torch.empty((bs * num_draft_token,), device="cuda", dtype=torch.long)
-
-    sgl_build_tree_kernel(
-        parent_list.unsqueeze(0),
-        index.unsqueeze(0),
-        verified_seq_len,
-        tree_mask,
-        positions,
-        retrive_index,
-        topk,
-        depth,
-        num_draft_token,
-    )
-
-    retrive_index = retrive_index[retrive_index.sum(dim=-1) != -depth - 2]
-
-    c_mask, c_positions, c_retive_index = create_mask(
-        verified_seq_len, num_draft_token, index, parent_list, depth
-    )
-
-    assert torch.allclose(tree_mask, c_mask), "tree mask has error."
-    assert torch.allclose(positions, c_positions), "positions has error."
-    assert torch.allclose(retrive_index, c_retive_index), "retrive_index has error."
-
-
 def test_build_tree_kernel_efficient():
     verified_id = torch.tensor([29974, 13], device="cuda", dtype=torch.int32)
     score_list = [
@@ -611,59 +325,6 @@ def test_build_tree_kernel_efficient():
     depth = 4
     num_draft_token = 8
 
-    tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
-        build_tree_kernel(
-            verified_id=verified_id,
-            score_list=score_list,
-            token_list=token_list,
-            parents_list=parents_list,
-            seq_lens=seq_lens,
-            seq_lens_sum=torch.sum(seq_lens).item(),
-            topk=topk,
-            spec_steps=depth,
-            num_verify_tokens=num_draft_token,
-        )
-    )
-
-    from sglang.srt.utils import first_rank_print
-
-    first_rank_print("=========== build tree kernel ==========")
-    # first_rank_print(f"{tree_mask=}", flush=True)
-    first_rank_print(f"{position=}", flush=True)
-    first_rank_print(f"{retrive_index=}", flush=True)
-    first_rank_print(f"{retrive_cum_len=}", flush=True)
-    first_rank_print(f"{draft_tokens=}", flush=True)
-    assert position.tolist() == [5, 6, 6, 7, 7, 8, 8, 9, 10, 11, 12, 12, 12, 12, 13, 14]
-    assert retrive_index.tolist() == [
-        [0, -1, -1, -1, -1, -1],
-        [0, 2, 4, 6, -1, -1],
-        [0, 1, 3, 5, 7, -1],
-        [8, -1, -1, -1, -1, -1],
-        [8, 9, 10, -1, -1, -1],
-        [8, 9, 12, -1, -1, -1],
-        [8, 9, 13, -1, -1, -1],
-        [8, 9, 11, 14, 15, -1],
-    ]
-    assert retrive_cum_len.tolist() == [0, 3, 8]
-    assert draft_tokens.tolist() == [
-        29974,
-        29896,
-        29906,
-        29889,
-        29974,
-        29946,
-        29896,
-        29946,
-        13,
-        13,
-        22550,
-        4136,
-        16492,
-        8439,
-        29871,
-        29941,
-    ]
-
     (
         tree_mask,
         position,
@@ -725,4 +386,3 @@ def test_build_tree_kernel_efficient():
 
 if __name__ == "__main__":
     test_build_tree_kernel_efficient()
-    test_build_tree_kernel()