sglang 0.4.6__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/disaggregation/decode.py +8 -2
- sglang/srt/disaggregation/fake/__init__.py +1 -0
- sglang/srt/disaggregation/fake/conn.py +88 -0
- sglang/srt/disaggregation/prefill.py +12 -3
- sglang/srt/disaggregation/utils.py +16 -2
- sglang/srt/entrypoints/engine.py +9 -0
- sglang/srt/entrypoints/http_server.py +27 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
- sglang/srt/layers/attention/utils.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
- sglang/srt/layers/quantization/fp8.py +20 -22
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/managers/schedule_batch.py +9 -0
- sglang/srt/managers/scheduler.py +10 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
- sglang/srt/managers/tp_worker.py +3 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
- sglang/srt/model_executor/model_runner.py +8 -1
- sglang/srt/openai_api/adapter.py +32 -3
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/reasoning_parser.py +25 -1
- sglang/srt/server_args.py +16 -2
- sglang/srt/utils.py +3 -0
- sglang/test/send_one.py +84 -28
- sglang/test/test_utils.py +38 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +2 -2
- {sglang-0.4.6.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +44 -29
- {sglang-0.4.6.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.6.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -72,8 +72,8 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()

 if _is_hip:
-    from aiter import ActivationType
-    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight

 if not _is_cuda:
@@ -484,7 +484,7 @@ class Fp8MoEMethod:
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = (
                 torch.uint32
-                if get_bool_env_var("
+                if get_bool_env_var("SGLANG_INT4_WEIGHT")
                 else torch.float8_e4m3fn
             )
         tp_size = get_tensor_model_parallel_world_size()
@@ -511,7 +511,7 @@ class Fp8MoEMethod:
         )

         # WEIGHTS
-        if _is_hip and get_bool_env_var("
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             # INT4 MoE weight - INT32 packed
             w13_weight = torch.nn.Parameter(
                 torch.empty(
@@ -585,7 +585,7 @@ class Fp8MoEMethod:

         if (
             _is_hip
-        ):  # and get_bool_env_var("
+        ):  # and get_bool_env_var("SGLANG_AITER_MOE"): TODO: add check back after triton kernel
             # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
             w13_weight_scale1 = torch.nn.Parameter(
                 torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -612,7 +612,7 @@ class Fp8MoEMethod:
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

-        if _is_hip and get_bool_env_var("
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             extra_weight_attrs.update(
                 {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
             )
@@ -644,7 +644,7 @@ class Fp8MoEMethod:
             layer.w2_input_scale = None

     def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and get_bool_env_var("
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             self.process_weights_hip_int4(layer)
             return

@@ -675,7 +675,7 @@ class Fp8MoEMethod:
             )
             layer.w2_input_scale = None

-            if get_bool_env_var("
+            if get_bool_env_var("SGLANG_AITER_MOE"):
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
@@ -798,17 +798,15 @@ class Fp8MoEMethod:
             return

     def process_weights_hip_int4(self, layer: Module):
-        # TODO: and get_bool_env_var("
+        # TODO: and get_bool_env_var("SGLANG_AITER_MOE"): add after triton kernel added
         # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
         # Weight Permutation
         layer.w13_weight = torch.nn.Parameter(
-            # permute_weight(layer.w13_weight.data),
             shuffle_weight(layer.w13_weight.data, (16, 16)),
             requires_grad=False,
         )
         torch.cuda.empty_cache()
         layer.w2_weight = torch.nn.Parameter(
-            # permute_weight(layer.w2_weight.data),
             shuffle_weight(layer.w2_weight.data, (16, 16)),
             requires_grad=False,
         )
@@ -847,23 +845,21 @@ class Fp8MoEMethod:
             padding_size,  # Avoid circular import
         )

-        if get_bool_env_var("
+        if get_bool_env_var("SGLANG_AITER_MOE"):
            layer.w13_weight = torch.nn.Parameter(
-                # permute_weight(layer.w13_weight.data),
                shuffle_weight(layer.w13_weight.data, (16, 16)),
                requires_grad=False,
            )
            torch.cuda.empty_cache()
            layer.w2_weight = torch.nn.Parameter(
-                # permute_weight(layer.w2_weight.data),
                shuffle_weight(layer.w2_weight.data, (16, 16)),
                requires_grad=False,
            )
            torch.cuda.empty_cache()
-            # ROCm (
+            # ROCm (SGLANG_AITER_MOE): using column-wise scaling
            layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
            layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
-        elif get_bool_env_var("
+        elif get_bool_env_var("SGLANG_MOE_PADDING"):
            # If ROCm, apply weight padding (min. Mem channel contention) only if set
            layer.w13_weight = torch.nn.Parameter(
                F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
@@ -912,15 +908,16 @@ class Fp8MoEMethod:
         )

         if _is_hip:
-            if get_bool_env_var("
-                # TODO: add triton kernel and add check get_bool_env_var("
+            if get_bool_env_var("SGLANG_INT4_WEIGHT"):
+                # TODO: add triton kernel and add check get_bool_env_var("SGLANG_AITER_MOE")
                 assert not no_combine, f"{no_combine=} is not supported."
-                return
+                return ck_moe_2stages(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
+                    QuantType.per_Token,
                     layer.w13_weight_scale1,
                     layer.w2_weight_scale1,
                     activation=(
@@ -930,13 +927,13 @@ class Fp8MoEMethod:
                     ),
                 )

-            if get_bool_env_var("
+            if get_bool_env_var("SGLANG_AITER_MOE"):
                 assert not no_combine, f"{no_combine=} is not supported."
                 if self.block_quant:
-                    # TODO(
+                    # TODO(SGLANG_AITER_MOE): FP8 block_quant only supports 'silu' for the time-being.
                     assert (
                         activation == "silu"
-                    ), f"
+                    ), f"SGLANG_AITER_MOE: FP8 bloack_quant {activation=} will be supported later, unset SGLANG_AITER_MOE"
                     return asm_moe(
                         x,
                         layer.w13_weight,
@@ -955,6 +952,7 @@ class Fp8MoEMethod:
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
+                    QuantType.per_Token,
                     layer.w13_weight_scale1,
                     layer.w2_weight_scale1,
                     activation=(
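The ROCm MoE code paths above are all gated on SGLANG_-prefixed boolean environment variables (SGLANG_AITER_MOE, SGLANG_INT4_WEIGHT, SGLANG_MOE_PADDING). As an illustrative sketch of how such a gate is evaluated (not part of the diff; read_bool_env_var below is a stand-in for sglang's get_bool_env_var helper, and the accepted truthy spellings are an assumption):

import os

def read_bool_env_var(name: str, default: str = "false") -> bool:
    # Minimal stand-in for sglang.srt.utils.get_bool_env_var.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")

# The ROCm-only branches above are taken only when the corresponding flag is set,
# e.g. by exporting SGLANG_AITER_MOE=1 in the server environment.
if read_bool_env_var("SGLANG_AITER_MOE"):
    print("aiter fused-MoE path would be selected")
else:
    print("default MoE path would be used")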
sglang/srt/layers/quantization/fp8_utils.py
CHANGED
@@ -31,7 +31,7 @@ from sglang.srt.utils import (
 _is_hip = is_hip()
 _is_cuda = is_cuda()

-if _is_hip and get_bool_env_var("
+if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
     from aiter import gemm_a8w8_blockscale

 if _is_cuda:
@@ -132,7 +132,7 @@ def apply_w8a8_block_fp8_linear(
         output = fp8_blockwise_scaled_mm(
             q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
         )
-    elif _is_hip and get_bool_env_var("
+    elif _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=False
         )
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -35,6 +35,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 import copy
 import dataclasses
 import logging
+import threading
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

 import numpy as np
@@ -724,6 +725,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # This is an optimization to reduce the overhead of the prefill check.
     batch_is_full: bool = False

+    # Events
+    launch_done: Optional[threading.Event] = None
+
     # Sampling info
     sampling_info: SamplingBatchInfo = None
     next_batch_sampling_info: SamplingBatchInfo = None
@@ -1511,6 +1515,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             )
             or global_server_args_dict["attention_backend"] == "flashmla"
             or global_server_args_dict["attention_backend"] == "fa3"
+            or global_server_args_dict["attention_backend"] == "cutlass_mla"
         ):
             seq_lens_cpu = self.seq_lens.cpu()
         else:
@@ -1565,6 +1570,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 )
             ),
             extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
+            launch_done=self.launch_done,
         )

     def copy(self):
@@ -1647,6 +1653,9 @@ class ModelWorkerBatch:
     # If set, the output of the batch contains the hidden states of the run.
     capture_hidden_mode: CaptureHiddenMode = None

+    # Overlap event
+    launch_done: Optional[threading.Event] = None
+

 @triton.jit
 def write_req_to_token_pool_triton(
sglang/srt/managers/scheduler.py
CHANGED
@@ -248,9 +248,6 @@ class Scheduler(
         if not self.is_generation:
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")
-        if self.model_config.is_multimodal:
-            self.enable_overlap = False
-            logger.info("Overlap scheduler is disabled for multimodal models.")

         # Launch a tensor parallel worker
         if self.enable_overlap:
@@ -645,6 +642,7 @@ class Scheduler(
         self.cur_batch = batch

         if batch:
+            batch.launch_done = threading.Event()
             result = self.run_batch(batch)
             self.result_queue.append((batch.copy(), result))

@@ -656,7 +654,7 @@
                     forward_mode=ForwardMode.DUMMY_FIRST,
                     next_batch_sampling_info=self.tp_worker.cur_sampling_info,
                 )
-                self.process_batch_result(tmp_batch, None)
+                self.process_batch_result(tmp_batch, None, batch.launch_done)

         if self.last_batch:
             # Process the results of the last batch
@@ -664,7 +662,10 @@
             tmp_batch.next_batch_sampling_info = (
                 self.tp_worker.cur_sampling_info if batch else None
             )
-
+            # NOTE: we should use current launched batch's launch_done event Instead of the last batch's
+            self.process_batch_result(
+                tmp_batch, tmp_result, batch.launch_done if batch else None
+            )
         elif batch is None:
             # When the server is idle, do self-check and re-init some states
             self.check_memory()
@@ -1417,14 +1418,15 @@
         self,
         batch: ScheduleBatch,
         result: Union[GenerationBatchResult, EmbeddingBatchResult],
+        launch_done: Optional[threading.Event] = None,
     ):
         if batch.forward_mode.is_decode():
-            self.process_batch_result_decode(batch, result)
+            self.process_batch_result_decode(batch, result, launch_done)
         elif batch.forward_mode.is_extend():
-            self.process_batch_result_prefill(batch, result)
+            self.process_batch_result_prefill(batch, result, launch_done)
         elif batch.forward_mode.is_idle():
             if self.enable_overlap:
-                self.tp_worker.
+                self.tp_worker.resolve_last_batch_result(launch_done)
                 if batch.next_batch_sampling_info:
                     batch.next_batch_sampling_info.update_regex_vocab_mask()
                     self.current_stream.synchronize()
sglang/srt/managers/scheduler_output_processor_mixin.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import threading
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union

 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -11,6 +12,7 @@ if TYPE_CHECKING:
         EmbeddingBatchResult,
         GenerationBatchResult,
         ScheduleBatch,
+        Scheduler,
     )


@@ -21,9 +23,10 @@ class SchedulerOutputProcessorMixin:
     """

     def process_batch_result_prefill(
-        self,
+        self: Scheduler,
         batch: ScheduleBatch,
         result: Union[GenerationBatchResult, EmbeddingBatchResult],
+        launch_done: Optional[threading.Event] = None,
     ):
         skip_stream_req = None

@@ -43,7 +46,11 @@
             )

             if self.enable_overlap:
-                logits_output, next_token_ids =
+                logits_output, next_token_ids = (
+                    self.tp_worker.resolve_last_batch_result(
+                        launch_done,
+                    )
+                )
             else:
                 # Move next_token_ids and logprobs to cpu
                 next_token_ids = next_token_ids.tolist()
@@ -175,9 +182,10 @@
         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

     def process_batch_result_decode(
-        self,
+        self: Scheduler,
         batch: ScheduleBatch,
         result: GenerationBatchResult,
+        launch_done: Optional[threading.Event] = None,
     ):
         logits_output, next_token_ids, bid = (
             result.logits_output,
@@ -187,7 +195,9 @@
         self.num_generated_tokens += len(batch.reqs)

         if self.enable_overlap:
-            logits_output, next_token_ids = self.tp_worker.
+            logits_output, next_token_ids = self.tp_worker.resolve_last_batch_result(
+                launch_done
+            )
             next_token_logprobs = logits_output.next_token_logprobs
         elif batch.spec_algorithm.is_none():
             # spec decoding handles output logprobs inside verify process.
@@ -271,7 +281,7 @@
             self.log_decode_stats()

     def add_input_logprob_return_values(
-        self,
+        self: Scheduler,
         i: int,
         req: Req,
         output: LogitsProcessorOutput,
@@ -405,7 +415,7 @@
         assert len(req.input_token_ids_logprobs_idx) == relevant_tokens_len

     def add_logprob_return_values(
-        self,
+        self: Scheduler,
         i: int,
         req: Req,
         pt: int,
@@ -436,7 +446,10 @@
         return num_input_logprobs

     def stream_output(
-        self
+        self: Scheduler,
+        reqs: List[Req],
+        return_logprob: bool,
+        skip_req: Optional[Req] = None,
     ):
         """Stream the output to detokenizer."""
         if self.is_generation:
@@ -445,7 +458,10 @@
             self.stream_output_embedding(reqs)

     def stream_output_generation(
-        self
+        self: Scheduler,
+        reqs: List[Req],
+        return_logprob: bool,
+        skip_req: Optional[Req] = None,
     ):
         rids = []
         finished_reasons: List[BaseFinishReason] = []
@@ -593,7 +609,7 @@
                 )
             )

-    def stream_output_embedding(self, reqs: List[Req]):
+    def stream_output_embedding(self: Scheduler, reqs: List[Req]):
         rids = []
         finished_reasons: List[BaseFinishReason] = []

sglang/srt/managers/tp_worker.py
CHANGED
@@ -170,13 +170,13 @@ class TpModelWorker:
     def forward_batch_generation(
         self,
         model_worker_batch: ModelWorkerBatch,
-        launch_done: Optional[threading.Event] = None,
         skip_sample: bool = False,
     ) -> Tuple[LogitsProcessorOutput, Optional[torch.Tensor]]:
         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
         logits_output = self.model_runner.forward(forward_batch)
-
-
+
+        if model_worker_batch.launch_done is not None:
+            model_worker_batch.launch_done.set()

         if skip_sample:
             next_token_ids = None
sglang/srt/managers/tp_worker_overlap_thread.py
CHANGED
@@ -132,7 +132,6 @@ class TpModelWorkerClient:
             batch_pt += 1

             # Create event
-            self.launch_done = threading.Event()
             copy_done = torch.get_device_module(self.device).Event()

             # Resolve future tokens in the input
@@ -141,7 +140,7 @@

             # Run forward
             logits_output, next_token_ids = self.worker.forward_batch_generation(
-                model_worker_batch
+                model_worker_batch
             )

             # Update the future token ids map
@@ -168,10 +167,16 @@

             self.output_queue.put((copy_done, logits_output, next_token_ids))

-    def
+    def resolve_last_batch_result(self, launch_done: Optional[threading.Event] = None):
+        """
+        This function is called to resolve the last batch result and
+        wait for the current batch to be launched. Used in overlap mode.
+        """
         copy_done, logits_output, next_token_ids = self.output_queue.get()
+
+        if launch_done is not None:
+            launch_done.wait()
         copy_done.synchronize()
-        self.launch_done.wait()

         if logits_output.next_token_logprobs is not None:
             logits_output.next_token_logprobs = (
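Taken together with the scheduler and schedule_batch changes above, a per-batch launch_done event replaces the worker client's single self.launch_done: the worker sets the event once a batch's forward pass has been launched, and the output processor waits on the event for the batch it is about to resolve. A minimal, self-contained sketch of that handshake (queue, thread, and function names here are illustrative, not sglang's internals):

import queue
import threading
from typing import Optional

output_queue = queue.Queue()

def forward_thread(batch_id: int, launch_done: threading.Event) -> None:
    # ... the real worker launches the GPU forward pass here ...
    launch_done.set()  # signal that this batch has been launched
    output_queue.put((launch_done, f"logits for batch {batch_id}"))

def resolve_last_batch_result(launch_done: Optional[threading.Event]) -> str:
    _, result = output_queue.get()
    if launch_done is not None:
        launch_done.wait()  # wait for the current batch to be launched
    return result

launch_done = threading.Event()  # analogous to batch.launch_done in the scheduler
threading.Thread(target=forward_thread, args=(0, launch_done)).start()
print(resolve_last_batch_result(launch_done))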
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -271,6 +271,7 @@ class ModelRunner:
                 "fa3",
                 "triton",
                 "flashmla",
+                "cutlass_mla",
             ]:
                 logger.info(
                     f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
@@ -926,6 +927,12 @@ class ModelRunner:
             )

             self.attn_backend = FlashAttentionBackend(self)
+        elif self.server_args.attention_backend == "cutlass_mla":
+            from sglang.srt.layers.attention.cutlass_mla_backend import (
+                CutlassMLABackend,
+            )
+
+            self.attn_backend = CutlassMLABackend(self)
         else:
             raise ValueError(
                 f"Invalid attention backend: {self.server_args.attention_backend}"
@@ -968,7 +975,7 @@ class ModelRunner:
         after_mem = get_available_gpu_memory(self.device, self.gpu_id)
         logger.info(
             f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. "
-            f"
+            f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
         )

     def apply_torch_tp(self):
sglang/srt/openai_api/adapter.py
CHANGED
@@ -971,6 +971,8 @@ def v1_chat_generate_request(
         )

         for message in request.messages:
+            if message.content is None:
+                message.content = ""
             if isinstance(message.content, str):
                 openai_compatible_messages.append(
                     {"role": message.role, "content": message.content}
@@ -1001,6 +1003,11 @@ def v1_chat_generate_request(
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                **(
+                    request.chat_template_kwargs
+                    if request.chat_template_kwargs
+                    else {}
+                ),
             )
         except:
             # This except branch will be triggered when the chosen model
@@ -1012,6 +1019,11 @@ def v1_chat_generate_request(
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                **(
+                    request.chat_template_kwargs
+                    if request.chat_template_kwargs
+                    else {}
+                ),
             )

         if assistant_prefix:
@@ -1179,6 +1191,7 @@ def v1_chat_generate_request(
         modalities=modalities_list,
         lora_path=lora_paths,
         bootstrap_host=all_requests[0].bootstrap_host,
+        bootstrap_port=all_requests[0].bootstrap_port,
         bootstrap_room=all_requests[0].bootstrap_room,
     )

@@ -1245,16 +1258,34 @@ def v1_chat_generate_response(
         tool_calls = None
         text = ret_item["text"]

+        enable_thinking = True
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools
             separate_reasoning = request[idx].separate_reasoning
+
+            if (
+                request[idx].chat_template_kwargs
+                and request[idx].chat_template_kwargs.get("enable_thinking") is not None
+            ):
+                enable_thinking = request[idx].chat_template_kwargs.get(
+                    "enable_thinking", True
+                )
         else:
             tool_choice = request.tool_choice
             tools = request.tools
             separate_reasoning = request.separate_reasoning

-
+            if (
+                request.chat_template_kwargs
+                and request.chat_template_kwargs.get("enable_thinking") is not None
+            ):
+                enable_thinking = request.chat_template_kwargs.get(
+                    "enable_thinking", True
+                )
+
+        reasoning_text = None
+        if reasoning_parser and separate_reasoning and enable_thinking:
             try:
                 parser = ReasoningParser(
                     model_type=reasoning_parser, stream_reasoning=False
@@ -1266,8 +1297,6 @@ def v1_chat_generate_response(
                     HTTPStatus.BAD_REQUEST,
                     "Failed to parse reasoning related info to json format!",
                 )
-            else:
-                reasoning_text = None

         if tool_choice != "none" and tools:
             parser = FunctionCallParser(tools, tool_call_parser)
sglang/srt/openai_api/protocol.py
CHANGED
@@ -361,9 +361,11 @@ class ChatCompletionRequest(BaseModel):
     session_params: Optional[Dict] = None
     separate_reasoning: bool = True
     stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None

     # For PD disaggregation
     bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None

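The new chat_template_kwargs request field is splatted into the tokenizer's apply_chat_template call and is also consulted for an enable_thinking switch that gates reasoning separation (see the adapter.py hunks above). A hedged request sketch against a locally running OpenAI-compatible sglang server; the URL, port, and model name are placeholders, and the response shape assumes the standard chat-completions format:

import requests

resp = requests.post(
    "http://127.0.0.1:30000/v1/chat/completions",  # placeholder endpoint
    json={
        "model": "Qwen/Qwen3-8B",  # placeholder model name
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "separate_reasoning": True,                          # existing field
        "chat_template_kwargs": {"enable_thinking": False},  # new in this release
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])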
sglang/srt/reasoning_parser.py
CHANGED
@@ -117,6 +117,29 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599


+class Qwen3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        # Qwen3 is assumed to be reasoning until `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -129,7 +152,8 @@ class ReasoningParser:
     """

     DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
-        "deepseek-r1": DeepSeekR1Detector
+        "deepseek-r1": DeepSeekR1Detector,
+        "qwen3": Qwen3Detector,
     }

     def __init__(self, model_type: str = None, stream_reasoning: bool = True):
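The new detector registers under the "qwen3" key of DetectorMap, so it is picked up when the server is configured with the qwen3 reasoning parser. As a rough, standalone illustration of the split it performs (a simplification, not sglang's BaseReasoningFormatDetector API):

from typing import Tuple

def split_think_block(text: str) -> Tuple[str, str]:
    # Everything before the closing </think> tag is treated as reasoning,
    # the rest as the final answer; with force_reasoning semantics, text
    # without a closing tag is still considered reasoning-in-progress.
    end_tag = "</think>"
    if end_tag not in text:
        return text.removeprefix("<think>"), ""
    reasoning, normal = text.split(end_tag, 1)
    return reasoning.removeprefix("<think>").strip(), normal.strip()

reasoning, answer = split_think_block("<think>2 + 2 = 4</think>The answer is 4.")
assert reasoning == "2 + 2 = 4" and answer == "The answer is 4."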
sglang/srt/server_args.py
CHANGED
@@ -256,6 +256,12 @@ class ServerArgs:
             )
             self.page_size = 64

+        if self.attention_backend == "cutlass_mla":
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -420,7 +426,7 @@ class ServerArgs:
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
-            help="If set, skip init tokenizer and pass input_ids in generate request",
+            help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
@@ -559,6 +565,7 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+
         # Memory and scheduling
         parser.add_argument(
             "--mem-fraction-static",
@@ -823,7 +830,14 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=[
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
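Per the hunks above, the new backend is selected with --attention-backend cutlass_mla, and ServerArgs then forces page_size to 128. A minimal sketch of that normalization, grounded in the hunk but with an illustrative function name (not sglang's API):

import logging

logger = logging.getLogger(__name__)

def normalize_page_size(attention_backend: str, page_size: int) -> int:
    # Mirrors the check added to ServerArgs above.
    if attention_backend == "cutlass_mla" and page_size != 128:
        logger.warning(
            "Cutlass MLA only supports a page_size of 128, change page_size to 128."
        )
        return 128
    return page_size

assert normalize_page_size("cutlass_mla", 64) == 128
assert normalize_page_size("fa3", 64) == 64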
sglang/srt/utils.py
CHANGED
@@ -1970,8 +1970,11 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "MixtralForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
+        "Qwen3ForCausalLM",
+        "Qwen3MoeForCausalLM",
     }
     return architectures[0] in default_archs

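This helper appears to gate which architectures default to the FA3 attention backend; the release adds Mixtral and the Qwen3 dense and MoE architectures to that set. An illustrative stand-in (the real function takes an hf_config and reads its architectures attribute; this version takes the list directly):

from typing import List

FA3_DEFAULT_ARCHS = {
    "Llama4ForConditionalGeneration",
    "LlamaForCausalLM",
    "MistralForCausalLM",
    "MixtralForCausalLM",       # newly added
    "Gemma2ForCausalLM",
    "Gemma3ForConditionalGeneration",
    "Qwen3ForCausalLM",         # newly added
    "Qwen3MoeForCausalLM",      # newly added
}

def is_fa3_default_architecture(architectures: List[str]) -> bool:
    # The check keys off the first declared architecture, as in the hunk above.
    return bool(architectures) and architectures[0] in FA3_DEFAULT_ARCHS

assert is_fa3_default_architecture(["Qwen3ForCausalLM"])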