sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/operations.py
CHANGED
@@ -12,7 +12,7 @@ if _ENABLE_PROFILE:


def execute_operations(inputs, operations):
-    stages = _convert_operations_to_stages(
+    stages = _convert_operations_to_stages(operations)
    executor = _StageExecutor("primary", stages, inputs=inputs)
    for _ in range(executor.num_stages):
        executor.next()
@@ -20,6 +20,37 @@ def execute_operations(inputs, operations):
    return executor.output


+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
class YieldOperation:
    pass

@@ -109,6 +140,9 @@ class _StateDict:
        for k, v in values.items():
            setattr(self, k, v)

+    def get(self, item):
+        return self._data.get(item)
+
    def clear(self, expect_keys: Sequence[str]):
        if set(self._data.keys()) != set(expect_keys):
            raise Exception(
@@ -119,6 +153,7 @@ class _StateDict:


def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
    operation_chunks = list(
        _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
    )
@@ -140,7 +175,7 @@ def _chunk_by_separator(
    yield pending_items


-def
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
    return [_decorate_operation(op, debug_name_prefix) for op in operations]

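The new execute_operations fix and the new execute_overlapped_operations are the core of two-batch overlap: batch "a" is advanced alone for delta_stage stages, then both executors are stepped together, and finally batch "b" drains its remaining stages. The sketch below is a hedged illustration of only that staggering pattern; ToyExecutor, overlap, and the stage names are invented for the example and are not the real _StageExecutor.

class ToyExecutor:
    def __init__(self, name, stages):
        self.name = name
        self.stages = stages
        self.index = 0

    @property
    def num_stages(self):
        return len(self.stages)

    @property
    def done(self):
        return self.index == self.num_stages

    def next(self):
        # Run one stage and advance; the real executor runs a chunk of operations here.
        print(f"{self.name}: {self.stages[self.index]}")
        self.index += 1


def overlap(stages_a, stages_b, delta_stage):
    a = ToyExecutor("a", stages_a)
    b = ToyExecutor("b", stages_b)
    for _ in range(delta_stage):                 # warm up batch a alone
        a.next()
    for _ in range(a.num_stages - delta_stage):  # overlapped region
        a.next()
        b.next()
    for _ in range(delta_stage):                 # drain batch b alone
        b.next()
    assert a.done and b.done


overlap(["attn", "dispatch", "experts", "combine"],
        ["attn", "dispatch", "experts", "combine"],
        delta_stage=2)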
sglang/srt/operations_strategy.py
CHANGED
@@ -1,31 +1,207 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
import torch

+from sglang.srt import operations
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.operations import Operation
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+

-
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
    layer: torch.nn.Module,
-
-
-
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
            layer.op_comm_prepare_attn,
-            layer.
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
            layer.op_comm_prepare_mlp,
-            layer.
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
            layer.op_comm_postprocess_layer,
-
-
-
-    return [
-        layer.op_comm_prepare_attn,
-        layer.op_attn,
-        layer.op_comm_prepare_mlp,
-        layer.mlp.op_gate,
-        layer.mlp.op_shared_experts,
-        layer.mlp.op_select_experts,
-        layer.mlp.op_dispatch_a,
-        layer.mlp.op_dispatch_b,
-        layer.mlp.op_experts,
-        layer.mlp.op_combine_a,
-        layer.mlp.op_combine_b,
-        layer.mlp.op_output,
-        layer.op_comm_postprocess_layer,
-    ]
+            operations.YieldOperation(),
+        ],
+    )
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -9,10 +9,12 @@ import torch

import sglang.srt.sampling.penaltylib as penaltylib
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL

if TYPE_CHECKING:
    from sglang.srt.managers.schedule_batch import ScheduleBatch

+
logger = logging.getLogger(__name__)


@@ -27,6 +29,12 @@ class SamplingBatchInfo:
    # Whether all requests use greedy sampling
    is_all_greedy: bool

+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
    # Whether any request needs min_p sampling
    need_min_p_sampling: bool

@@ -133,6 +141,8 @@ class SamplingBatchInfo:
            top_ks=top_ks,
            min_ps=min_ps,
            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+            need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
            vocab_size=vocab_size,
            penalizer_orchestrator=penalizer_orchestrator,
@@ -167,7 +177,7 @@ class SamplingBatchInfo:

        # Apply the mask
        for i, grammar in enumerate(self.grammars):
-            if grammar and not grammar.finished:
+            if grammar and not grammar.finished and not grammar.is_terminated():
                grammar.fill_vocab_mask(self.vocab_mask, i)

        # Move the mask to the device if needed
@@ -308,4 +318,6 @@ class SamplingBatchInfo:
            setattr(self, item, torch.cat([self_val, other_val]))

        self.is_all_greedy &= other.is_all_greedy
+        self.need_top_p_sampling |= other.need_top_p_sampling
+        self.need_top_k_sampling |= other.need_top_k_sampling
        self.need_min_p_sampling |= other.need_min_p_sampling
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -16,6 +16,7 @@
from typing import Any, Dict, List, Optional, Union

_SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30


class SamplingParams:
@@ -84,7 +85,7 @@ class SamplingParams:
            self.temperature = 1.0
            self.top_k = 1
        if self.top_k == -1:
-            self.top_k =
+            self.top_k = TOP_K_ALL  # whole vocabulary

    def verify(self):
        if self.temperature < 0.0:
sglang/srt/server_args.py
CHANGED
@@ -28,6 +28,7 @@ from sglang.srt.utils import (
|
|
28
28
|
configure_ipv6,
|
29
29
|
get_device,
|
30
30
|
get_device_memory_capacity,
|
31
|
+
is_cuda,
|
31
32
|
is_flashinfer_available,
|
32
33
|
is_hip,
|
33
34
|
is_port_available,
|
@@ -60,6 +61,7 @@ class ServerArgs:
|
|
60
61
|
is_embedding: bool = False
|
61
62
|
enable_multimodal: Optional[bool] = None
|
62
63
|
revision: Optional[str] = None
|
64
|
+
impl: str = "auto"
|
63
65
|
|
64
66
|
# Port for the HTTP server
|
65
67
|
host: str = "127.0.0.1"
|
@@ -163,20 +165,24 @@ class ServerArgs:
|
|
163
165
|
enable_tokenizer_batch_encode: bool = False
|
164
166
|
disable_outlines_disk_cache: bool = False
|
165
167
|
disable_custom_all_reduce: bool = False
|
168
|
+
enable_mscclpp: bool = False
|
166
169
|
disable_overlap_schedule: bool = False
|
167
170
|
enable_mixed_chunk: bool = False
|
168
171
|
enable_dp_attention: bool = False
|
169
172
|
enable_dp_lm_head: bool = False
|
173
|
+
enable_two_batch_overlap: bool = False
|
170
174
|
enable_ep_moe: bool = False
|
171
175
|
enable_deepep_moe: bool = False
|
172
176
|
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
|
173
177
|
ep_num_redundant_experts: int = 0
|
174
|
-
ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
|
178
|
+
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
|
175
179
|
init_expert_location: str = "trivial"
|
176
180
|
enable_eplb: bool = False
|
181
|
+
eplb_algorithm: str = "auto"
|
177
182
|
eplb_rebalance_num_iterations: int = 1000
|
183
|
+
eplb_rebalance_layers_per_chunk: Optional[int] = None
|
178
184
|
expert_distribution_recorder_mode: Optional[
|
179
|
-
Literal["stat", "per_pass", "per_token"]
|
185
|
+
Literal["stat", "stat_approx", "per_pass", "per_token"]
|
180
186
|
] = None
|
181
187
|
expert_distribution_recorder_buffer_size: Optional[int] = None
|
182
188
|
enable_expert_distribution_metrics: bool = False
|
@@ -203,7 +209,7 @@ class ServerArgs:
|
|
203
209
|
flashinfer_mla_disable_ragged: bool = False
|
204
210
|
warmups: Optional[str] = None
|
205
211
|
moe_dense_tp_size: Optional[int] = None
|
206
|
-
|
212
|
+
disable_shared_experts_fusion: bool = False
|
207
213
|
disable_chunked_prefix_cache: bool = False
|
208
214
|
disable_fast_image_processor: bool = False
|
209
215
|
mm_attention_backend: Optional[str] = None
|
@@ -259,17 +265,28 @@ class ServerArgs:
|
|
259
265
|
self.mem_fraction_static = 0.88
|
260
266
|
else:
|
261
267
|
self.mem_fraction_static = 0.88
|
262
|
-
if gpu_mem is not None and gpu_mem >
|
268
|
+
if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
|
269
|
+
self.mem_fraction_static = 0.79
|
270
|
+
elif gpu_mem is not None and gpu_mem > 96 * 1024:
|
263
271
|
mem_fraction = self.mem_fraction_static
|
272
|
+
# 15 GB + additional 3GB for cuda graph
|
273
|
+
reserve_mem = 1024 * 18
|
274
|
+
# need reserve more memory for spec cuda graph
|
275
|
+
if self.speculative_algorithm is not None:
|
276
|
+
reserve_mem = 1024 * 20
|
264
277
|
self.mem_fraction_static = min(
|
265
278
|
mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
|
266
|
-
(gpu_mem -
|
267
|
-
/ gpu_mem, # 15 GB + additional 3GB for cuda graph
|
279
|
+
(gpu_mem - reserve_mem) / gpu_mem,
|
268
280
|
)
|
281
|
+
else:
|
282
|
+
if self.speculative_algorithm is not None:
|
283
|
+
self.mem_fraction_static *= 0.95
|
269
284
|
|
270
285
|
# Set chunked prefill size, which depends on the gpu memory capacity
|
271
286
|
if self.chunked_prefill_size is None:
|
272
|
-
if gpu_mem is not None and gpu_mem
|
287
|
+
if gpu_mem is not None and gpu_mem > 180_000:
|
288
|
+
self.chunked_prefill_size = 16384
|
289
|
+
elif gpu_mem is not None and gpu_mem < 25_000:
|
273
290
|
self.chunked_prefill_size = 2048
|
274
291
|
elif self.disaggregation_mode != "null":
|
275
292
|
self.chunked_prefill_size = 16384
|
@@ -309,6 +326,11 @@ class ServerArgs:
|
|
309
326
|
self.sampling_backend = "pytorch"
|
310
327
|
|
311
328
|
# Set kernel backends
|
329
|
+
if self.device == "cpu":
|
330
|
+
if self.attention_backend is None:
|
331
|
+
self.attention_backend = "intel_amx"
|
332
|
+
self.sampling_backend = "pytorch"
|
333
|
+
|
312
334
|
if self.sampling_backend is None:
|
313
335
|
self.sampling_backend = (
|
314
336
|
"flashinfer" if is_flashinfer_available() else "pytorch"
|
@@ -365,12 +387,28 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

+        if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
+            self.expert_distribution_recorder_mode = "stat"
+            logger.info(
+                f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+            )
+
+        if (self.enable_eplb or (self.init_expert_location is not None)) and (
+            self.ep_dispatch_algorithm is None
+        ):
+            self.ep_dispatch_algorithm = "static"
+            logger.info(
+                f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+            )
+
+        if self.enable_expert_distribution_metrics and (
+            self.expert_distribution_recorder_mode is None
+        ):
+            self.expert_distribution_recorder_mode = "stat"
+
         if self.expert_distribution_recorder_buffer_size is None:
-
-
-            # self.expert_distribution_recorder_buffer_size = x
-            if False:
-                pass
+            if (x := self.eplb_rebalance_num_iterations) is not None:
+                self.expert_distribution_recorder_buffer_size = x
             elif self.expert_distribution_recorder_mode is not None:
                 self.expert_distribution_recorder_buffer_size = 1000

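Read together, the additions above mean: enabling EPLB implies "stat" expert-distribution recording and a "static" dispatch algorithm, and the recorder buffer is sized from eplb_rebalance_num_iterations, falling back to 1000 when only a recorder mode is set. A hedged, standalone sketch of that defaulting chain (illustrative, not the library code):

# Standalone sketch of the EPLB-related defaulting above (illustrative, not sglang code).
def default_eplb_settings(enable_eplb, init_expert_location=None,
                          recorder_mode=None, dispatch_algorithm=None,
                          rebalance_num_iterations=None, buffer_size=None,
                          metrics=False):
    if enable_eplb and recorder_mode is None:
        recorder_mode = "stat"
    if (enable_eplb or init_expert_location is not None) and dispatch_algorithm is None:
        dispatch_algorithm = "static"
    if metrics and recorder_mode is None:
        recorder_mode = "stat"
    if buffer_size is None:
        if (x := rebalance_num_iterations) is not None:   # same walrus pattern as the diff
            buffer_size = x
        elif recorder_mode is not None:
            buffer_size = 1000
    return recorder_mode, dispatch_algorithm, buffer_size

assert default_eplb_settings(True, rebalance_num_iterations=500) == ("stat", "static", 500)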
@@ -387,6 +425,12 @@ class ServerArgs:
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
+            if self.enable_mixed_chunk:
+                self.enable_mixed_chunk = False
+                logger.warning(
+                    "Mixed chunked prefill is disabled because of using "
+                    "eagle speculative decoding."
+                )

         model_arch = get_model_arch(self)

@@ -409,7 +453,7 @@ class ServerArgs:
                self.speculative_num_steps,
                self.speculative_eagle_topk,
                self.speculative_num_draft_tokens,
-            ) = auto_choose_speculative_params(
+            ) = auto_choose_speculative_params(self)

            if self.page_size > 1 and self.speculative_eagle_topk > 1:
                self.speculative_eagle_topk = 1
@@ -691,6 +735,18 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Other runtime options
         parser.add_argument(
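Because --impl defaults to ServerArgs.impl, the same switch should be reachable wherever ServerArgs fields are accepted as keyword arguments. The sketch below assumes the offline sglang.Engine entrypoint forwards its kwargs into ServerArgs and uses a small public checkpoint purely as an example; neither is asserted by this diff.

# Hedged usage sketch (assumptions: Engine forwards **kwargs to ServerArgs and
# the model id below is reachable; both are assumptions, not facts from this diff).
import sglang as sgl

llm = sgl.Engine(
    model_path="Qwen/Qwen2.5-0.5B-Instruct",
    impl="transformers",   # force the Transformers fallback instead of "auto"
)
print(llm.generate("Hello, my name is", {"max_new_tokens": 8}))
llm.shutdown()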
@@ -957,12 +1013,13 @@ class ServerArgs:
             type=str,
             choices=[
                 "aiter",
-                "
-                "triton",
-                "torch_native",
+                "cutlass_mla",
                 "fa3",
+                "flashinfer",
                 "flashmla",
-                "
+                "intel_amx",
+                "torch_native",
+                "triton",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
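The backend list is now alphabetized and gains intel_amx; argparse's choices mechanism is what rejects anything else. A self-contained mirror of just that validation (plain argparse, not the sglang parser itself):

# Standalone mirror of the choices validation above (not the sglang parser).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--attention-backend",
    type=str,
    choices=["aiter", "cutlass_mla", "fa3", "flashinfer",
             "flashmla", "intel_amx", "torch_native", "triton"],
    default=None,
)
print(parser.parse_args(["--attention-backend", "intel_amx"]).attention_backend)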
@@ -1119,6 +1176,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-mscclpp",
+            action="store_true",
+            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -1144,6 +1206,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
+        parser.add_argument(
+            "--enable-two-batch-overlap",
+            action="store_true",
+            help="Enabling two micro batches to overlap.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -1295,12 +1362,24 @@ class ServerArgs:
             action="store_true",
             help="Enable EPLB algorithm",
         )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
         parser.add_argument(
             "--eplb-rebalance-num-iterations",
             type=int,
             default=ServerArgs.eplb_rebalance_num_iterations,
             help="Number of iterations to automatically trigger a EPLB re-balance.",
         )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1322,15 +1401,12 @@ class ServerArgs:
             "--deepep-config",
             type=str,
             default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster.",
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
         )
-
         parser.add_argument(
-            "--
-
-
-            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
+            "--disable-shared-experts-fusion",
+            action="store_true",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
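The --deepep-config help now states the value may be inline JSON or a file path. How such a value could be normalized is sketched below; the helper and the num_sms key are made up for illustration and are not sglang APIs.

# Illustrative helper (not sglang code): accept inline JSON or a path, per the new help text.
import json
import os

def load_json_or_path(value):
    if value is None:
        return None
    if os.path.isfile(value):
        with open(value) as f:
            return json.load(f)
    return json.loads(value)   # otherwise treat the string itself as JSON content

print(load_json_or_path('{"num_sms": 20}'))   # hypothetical key, for demonstration only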
@@ -1451,7 +1527,7 @@ class ServerArgs:
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and
+        ), "compatibility of lora and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
@@ -1585,18 +1661,29 @@ def get_model_arch(args: ServerArgs):
     return hf_config.architectures[0]


-def auto_choose_speculative_params(
+def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.

     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
+    kwargs = {}
+
+    hf_config = get_config(
+        self.model_path,
+        trust_remote_code=self.trust_remote_code,
+        revision=self.revision,
+        model_override_args=json.loads(self.json_model_override_args),
+        **kwargs,
+    )
+    arch = hf_config.architectures[0]
+
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
         # The default value for deepseek
-        return (
+        return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
     else:
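The rewritten helper takes the full ServerArgs, loads the HF config itself, and returns (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens) keyed on the model architecture. Below is a standalone sketch of just the mapping visible in this hunk; the final else branch is cut off here, so the fallback is left as a parameter rather than guessed.

# Standalone sketch of the per-architecture defaults visible above
# (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens).
SPEC_DEFAULTS = {
    "LlamaForCausalLM": (5, 4, 8),
    "DeepseekV3ForCausalLM": (3, 1, 4),
    "DeepseekV2ForCausalLM": (3, 1, 4),
    "Grok1ForCausalLM": (5, 4, 8),
    "Grok1VForCausalLM": (5, 4, 8),
}

def spec_defaults_for(arch, fallback=None):
    # The real helper reads the architecture from the HF config; here it is a parameter,
    # and `fallback` stands in for the function's else branch (not shown in this hunk).
    return SPEC_DEFAULTS.get(arch, fallback)

steps, topk, draft_tokens = spec_defaults_for("DeepseekV3ForCausalLM")
assert (steps, topk, draft_tokens) == (3, 1, 4)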