PyPI - sglang - Versions diffs - 0.4.2.post3__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

sglang 0.4.2.post3__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)

sglang/srt/models/deepseek_v2.py CHANGED Viewed

@@ -255,6 +255,8 @@ class DeepseekV2Attention(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
+            # FIXME: quick fix for skip quantization
+            prefix=f"self_attn.kv_a_proj_with_mqa",
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
@@ -455,6 +457,8 @@ class DeepseekV2AttentionMLA(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
+            # FIXME: quick fix for skip quantization
+            prefix=f"self_attn.kv_a_proj_with_mqa",
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
@@ -506,14 +510,20 @@ class DeepseekV2AttentionMLA(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
     ) -> torch.Tensor:
-        # Use normal computation for prefill and use weight absorption for extend/decode
-        if (
-            forward_batch.forward_mode.is_extend()
-            and forward_batch.extend_prefix_lens.sum() == 0
-        ):
-            return self.forward_normal(positions, hidden_states, forward_batch)
+        if global_server_args_dict["enable_flashinfer_mla"]:
+            if forward_batch.forward_mode.is_extend():
+                return self.forward_normal(positions, hidden_states, forward_batch)
+            else:
+                return self.forward_absorb(positions, hidden_states, forward_batch)
         else:
-            return self.forward_absorb(positions, hidden_states, forward_batch)
+            # Triton: Use normal computation for prefill and use weight absorption for extend/decode
+            if (
+                forward_batch.forward_mode.is_extend()
+                and forward_batch.extend_prefix_lens.sum() == 0
+            ):
+                return self.forward_normal(positions, hidden_states, forward_batch)
+            else:
+                return self.forward_absorb(positions, hidden_states, forward_batch)
     def forward_normal(
         self,

sglang/srt/server_args.py CHANGED Viewed

@@ -140,6 +140,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
@@ -160,12 +161,15 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
+    return_hidden_states: bool = False
     # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False
+    enable_flashinfer_mla: bool = False
     def __post_init__(self):
         # Set missing default values
         if self.tokenizer_path is None:
@@ -691,6 +695,11 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mla",
+            action="store_true",
+            help="Enable FlashInfer MLA optimization",
+        )
         # Speculative decoding
         parser.add_argument(
@@ -782,6 +791,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-nccl-nvls",
+            action="store_true",
+            help="Enable NCCL NVLS for prefill heavy requests when available.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
@@ -795,7 +809,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
@@ -896,6 +910,11 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
+        parser.add_argument(
+            "--return-hidden-states",
+            action="store_true",
+            help="Return hidden states in the response.",
+        )
         # Function Calling
         parser.add_argument(
             "--tool-call-parser",

sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED Viewed

@@ -85,6 +85,7 @@ class EAGLEDraftCudaGraphRunner:
                 "1. disable cuda graph by --disable-cuda-graph\n"
                 "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
+                "4. specify --dtype to the same dtype (e.g. bfloat16)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )

sglang/srt/speculative/eagle_utils.py CHANGED Viewed

@@ -4,6 +4,7 @@ import dataclasses
 from typing import TYPE_CHECKING, List
 import torch
+import torch.nn.functional as F
 import triton
 import triton.language as tl
@@ -11,7 +12,14 @@ from sglang.srt.layers.attention.flashinfer_backend import (
     create_flashinfer_kv_indices_triton,
 )
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
-from sglang.srt.speculative.build_eagle_tree import build_tree_kernel
+from sglang.srt.speculative.build_eagle_tree import (
+    build_tree_kernel,
+    build_tree_kernel_efficient,
+)
+from sglang.srt.utils import is_cuda_available
+if is_cuda_available():
+    from sgl_kernel import tree_speculative_sampling_target_only
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -160,8 +168,11 @@ class EagleVerifyInput:
     custom_mask: torch.Tensor
     positions: torch.Tensor
     retrive_index: torch.Tensor
+    retrive_next_token: torch.Tensor
+    retrive_next_sibling: torch.Tensor
     retrive_cum_len: torch.Tensor
     draft_token_num: int
+    spec_steps: int
     capture_hidden_mode: CaptureHiddenMode
     @classmethod
@@ -175,10 +186,45 @@ class EagleVerifyInput:
         seq_lens_sum: int,
         topk: int,
         spec_steps: int,
-        num_verify_token: int,
+        num_verify_tokens: int,
+        is_all_greedy: bool,
     ):
-        tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
-            build_tree_kernel(
+        if is_all_greedy:
+            tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
+                build_tree_kernel(
+                    verified_id,
+                    score_list,  # b, n, topk; n= 1 + (num_steps-1) * self.topk
+                    token_list,
+                    parents_list,
+                    seq_lens,
+                    seq_lens_sum,
+                    topk,
+                    spec_steps,
+                    num_verify_tokens,
+                )
+            )
+            return cls(
+                draft_tokens,
+                tree_mask,
+                position,
+                retrive_index,
+                None,
+                None,
+                retrive_cum_len,
+                num_verify_tokens,
+                spec_steps,
+                CaptureHiddenMode.FULL,
+            )
+        else:
+            (
+                tree_mask,
+                position,
+                retrive_index,
+                retrive_next_token,
+                retrive_next_sibling,
+                draft_tokens,
+            ) = build_tree_kernel_efficient(
                 verified_id,
                 score_list,
                 token_list,
@@ -187,18 +233,21 @@ class EagleVerifyInput:
                 seq_lens_sum,
                 topk,
                 spec_steps,
-                num_verify_token,
+                num_verify_tokens,
+            )
+            return cls(
+                draft_tokens,
+                tree_mask,
+                position,
+                retrive_index,
+                retrive_next_token,
+                retrive_next_sibling,
+                None,
+                num_verify_tokens,
+                spec_steps,
+                CaptureHiddenMode.FULL,
             )
-        )
-        return cls(
-            draft_tokens,
-            tree_mask,
-            position,
-            retrive_index,
-            retrive_cum_len,
-            num_verify_token,
-            CaptureHiddenMode.FULL,
-        )
     def prepare_for_verify(self, batch: ScheduleBatch):
         batch.input_ids = self.draft_token
@@ -313,12 +362,6 @@ class EagleVerifyInput:
                 uniform_samples=coins,
                 target_probs=target_probs,
                 draft_probs=draft_probs,
-                threshold_single=global_server_args_dict[
-                    "speculative_accept_threshold_single"
-                ],
-                threshold_acc=global_server_args_dict[
-                    "speculative_accept_threshold_acc"
-                ],
                 deterministic=True,
             )

sglang/srt/speculative/eagle_worker.py CHANGED Viewed

@@ -65,15 +65,31 @@ class EAGLEWorker(TpModelWorker):
         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
         # Create multi-step attn backends and cuda graph runners
-        from sglang.srt.layers.attention.flashinfer_backend import (
-            FlashInferMultiStepDraftBackend,
-        )
+        if server_args.attention_backend == "flashinfer":
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
+            )
+            self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        elif server_args.attention_backend == "triton":
+            from sglang.srt.layers.attention.triton_backend import (
+                TritonMultiStepDraftBackend,
+            )
+            self.draft_attn_backend = TritonMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        else:
+            raise ValueError(
+                f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
+            )
-        self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-            self.model_runner,
-            self.topk,
-            self.speculative_num_steps,
-        )
         self.model_runner.draft_attn_backend = self.draft_attn_backend
         self.init_cuda_graphs()
@@ -185,6 +201,7 @@ class EAGLEWorker(TpModelWorker):
             self.topk,
             self.speculative_num_steps,
             self.server_args.speculative_num_draft_tokens,
+            batch.sampling_info.is_all_greedy,
         )
         # Free cache locations
@@ -217,6 +234,10 @@ class EAGLEWorker(TpModelWorker):
             token_list.append(tree_info[1])
             parents_list.append(tree_info[2])
+            # we don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
+            if i == self.speculative_num_steps - 1:
+                break
             # Set inputs
             forward_batch.input_ids = input_ids
             forward_batch.out_cache_loc = out_cache_loc[

sglang/srt/utils.py CHANGED Viewed

@@ -1444,3 +1444,10 @@ def launch_dummy_health_check_server(host, port):
         timeout_keep_alive=5,
         loop="uvloop",
     )
+def set_cuda_arch():
+    if is_flashinfer_available():
+        capability = torch.cuda.get_device_capability()
+        arch = f"{capability[0]}.{capability[1]}"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"

sglang/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.4.2.post3"
+__version__ = "0.4.3"

{sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post3
+Version: 0.4.3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -236,14 +236,15 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3.post2; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.3.post6; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.6.4.post1; extra == "srt"
-Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt"
+Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
+Requires-Dist: flashinfer_python>=0.2.1.post1; extra == "srt"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"

sglang 0.4.2.post3__py3-none-any.whl → 0.4.3__py3-none-any.whl

sglang 0.4.2.post3__py3-none-any.whl → 0.4.3__py3-none-any.whl