sglang 0.5.1.post1__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sglang/bench_one_batch_server.py +79 -53
  2. sglang/bench_serving.py +186 -14
  3. sglang/profiler.py +0 -1
  4. sglang/srt/conversation.py +38 -5
  5. sglang/srt/disaggregation/decode.py +4 -0
  6. sglang/srt/disaggregation/prefill.py +4 -0
  7. sglang/srt/entrypoints/engine.py +2 -2
  8. sglang/srt/entrypoints/openai/protocol.py +27 -24
  9. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  10. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  11. sglang/srt/entrypoints/tool.py +7 -7
  12. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  15. sglang/srt/harmony_parser.py +588 -0
  16. sglang/srt/hf_transformers_utils.py +16 -7
  17. sglang/srt/layers/attention/ascend_backend.py +218 -111
  18. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  19. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  20. sglang/srt/layers/attention/flashinfer_mla_backend.py +76 -91
  21. sglang/srt/layers/attention/utils.py +15 -94
  22. sglang/srt/layers/communicator.py +1 -2
  23. sglang/srt/layers/moe/cutlass_moe.py +0 -15
  24. sglang/srt/layers/moe/ep_moe/layer.py +1 -7
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  27. sglang/srt/layers/moe/topk.py +1 -1
  28. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  29. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
  30. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  31. sglang/srt/layers/quantization/fp8.py +2 -1
  32. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  33. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  35. sglang/srt/layers/quantization/mxfp4.py +16 -23
  36. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  37. sglang/srt/layers/utils.py +0 -14
  38. sglang/srt/lora/lora_manager.py +29 -12
  39. sglang/srt/managers/cache_controller.py +223 -156
  40. sglang/srt/managers/detokenizer_manager.py +5 -0
  41. sglang/srt/managers/io_struct.py +30 -0
  42. sglang/srt/managers/scheduler.py +58 -7
  43. sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
  44. sglang/srt/managers/tokenizer_manager.py +36 -3
  45. sglang/srt/mem_cache/hicache_storage.py +31 -20
  46. sglang/srt/mem_cache/hiradix_cache.py +12 -3
  47. sglang/srt/mem_cache/memory_pool.py +73 -14
  48. sglang/srt/mem_cache/memory_pool_host.py +3 -2
  49. sglang/srt/mem_cache/radix_cache.py +1 -0
  50. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
  51. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
  52. sglang/srt/metrics/collector.py +5 -5
  53. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +12 -3
  56. sglang/srt/models/gpt_oss.py +2 -1
  57. sglang/srt/models/qwen2_5_vl.py +1 -0
  58. sglang/srt/offloader.py +115 -0
  59. sglang/srt/reasoning_parser.py +56 -300
  60. sglang/srt/server_args.py +10 -5
  61. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  62. sglang/srt/utils.py +59 -12
  63. sglang/test/test_cutlass_moe.py +33 -28
  64. sglang/version.py +1 -1
  65. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +6 -5
  66. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +69 -65
  67. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
  68. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
  69. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
@@ -24,13 +24,18 @@ if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
 
 from sglang.global_config import global_config
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
-from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.attention.flashinfer_backend import (
+    create_flashinfer_kv_indices_triton,
+)
 from sglang.srt.layers.dp_attention import get_attention_tp_size
-from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available, next_power_of_2
+from sglang.srt.utils import (
+    is_flashinfer_available,
+    is_sm100_supported,
+    next_power_of_2,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -179,6 +184,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         q_indptr_decode_buf: Optional[torch.Tensor] = None,
     ):
         super().__init__()
+
         # Parse constants
         self.max_context_len = model_runner.model_config.context_len
         self.device = model_runner.device
@@ -210,25 +216,15 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         else:
             self.kv_indptr = kv_indptr_buf
 
-        self.kv_indices = torch.empty(
-            (max_bs * (self.max_context_len + self.page_size - 1) // self.page_size,),
-            dtype=torch.int32,
-            device=model_runner.device,
-        )
-
         if not self.skip_prefill:
             self.qo_indptr = torch.zeros(
                 (max_bs + 1,), dtype=torch.int32, device=model_runner.device
             )
 
         if q_indptr_decode_buf is None:
-            # A hack to pre-initialize large batch size for dp attention
-            if model_runner.server_args.enable_dp_attention:
-                max_bs = model_runner.server_args.dp_size * max_bs
             self.q_indptr_decode = torch.arange(
                 0, max_bs + 1, dtype=torch.int32, device=model_runner.device
             )
-
         else:
             self.q_indptr_decode = q_indptr_decode_buf
 
@@ -273,7 +269,6 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         self.prefill_cuda_graph_metadata = {}  # For verify
 
     def init_forward_metadata(self, forward_batch: ForwardBatch):
-
         if forward_batch.forward_mode.is_decode_or_idle():
             self.indices_updater_decode.update(
                 forward_batch.req_pool_indices,
@@ -331,9 +326,16 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         max_num_tokens: int,
         kv_indices_buf: Optional[torch.Tensor] = None,
     ):
-        self.cuda_graph_kv_indices = (
-            self.kv_indices.clone() if kv_indices_buf is None else kv_indices_buf
-        )
+        if kv_indices_buf is None:
+            cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len,),
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = kv_indices_buf
+
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
         self.cuda_graph_qo_indptr = self.q_indptr_decode.clone()
         self.cuda_graph_kv_indptr = self.kv_indptr.clone()
         self.cuda_graph_kv_lens = torch.ones(
@@ -359,7 +361,6 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         forward_mode: ForwardMode,
         spec_info: Optional[SpecInfo],
     ):
-
         if forward_mode.is_decode_or_idle():
             decode_wrapper = BatchMLAPagedAttentionWrapper(
                 self.workspace_buffer,
@@ -370,6 +371,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
                 kv_len_arr=self.cuda_graph_kv_lens[:num_tokens],
                 backend="auto",
             )
+
             seq_lens_sum = seq_lens.sum().item()
             self.indices_updater_decode.update(
                 req_pool_indices,
@@ -440,13 +442,11 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         spec_info: Optional[SpecInfo],
         seq_lens_cpu: Optional[torch.Tensor],
     ):
-
         if forward_mode.is_decode_or_idle():
             assert seq_lens_cpu is not None
             kv_len_arr_cpu = seq_lens_cpu[:bs]
-            num_pages_per_req = (seq_lens_cpu + self.page_size - 1) // self.page_size
             self.cuda_graph_kv_indptr_cpu[1 : bs + 1] = torch.cumsum(
-                num_pages_per_req, dim=0
+                kv_len_arr_cpu, dim=0
             )
             self.fast_decode_kwargs.update(
                 {
@@ -455,6 +455,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
                     "kv_len_arr_cpu": kv_len_arr_cpu,
                 }
             )
+
             self.indices_updater_decode.update(
                 req_pool_indices[:bs],
                 seq_lens[:bs],
@@ -534,6 +535,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
             q_rope = q_rope.view(
                 -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
             )
+
         if self.forward_metadata.use_ragged:
             # ragged prefill
             if q_rope is not None:
@@ -554,8 +556,6 @@ class FlashInferMLAAttnBackend(AttentionBackend):
             k_buf = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
                 q.dtype
             )
-            k_buf = k_buf.view(-1, self.page_size, k_buf.shape[-1])
-
             if q_rope is None:
                 qall = q.view(-1, layer.tp_q_head_num, layer.head_dim)
                 q, q_rope = (
@@ -617,17 +617,17 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         q_nope = reshaped_q[:, :, : layer.v_head_dim]
         q_rope = reshaped_q[:, :, layer.v_head_dim :]
 
-        k_buf = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+        k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
             q.dtype
         )
-        k_buf = k_buf.view(-1, self.page_size, k_buf.shape[-1])
 
         o = q_nope.new_empty(q_nope.shape)
+        # Direct call to run without the wrapper
         o = decode_wrapper.run(
             q_nope,
             q_rope,
-            k_buf[:, :, : layer.v_head_dim],
-            k_buf[:, :, layer.v_head_dim :],
+            k_buffer[:, :, : layer.v_head_dim],
+            k_buffer[:, :, layer.v_head_dim :],
             out=o,
         )
 
@@ -646,10 +646,9 @@ class FlashInferMLAIndicesUpdaterDecode:
         self.scaling = model_runner.model_config.scaling
         self.data_type = model_runner.dtype
         self.attn_backend = attn_backend
-        self.page_size = model_runner.page_size
+
         # Buffers and wrappers
         self.kv_indptr = attn_backend.kv_indptr
-        self.kv_indices = attn_backend.kv_indices
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
         self.q_indptr = attn_backend.q_indptr_decode
 
@@ -693,17 +692,13 @@ class FlashInferMLAIndicesUpdaterDecode:
         kv_lens = paged_kernel_lens.to(torch.int32)
         sm_scale = self.scaling
         if spec_info is None:
-            num_pages_per_req = (
-                paged_kernel_lens + self.page_size - 1
-            ) // self.page_size
-            kv_indptr[1 : bs + 1] = torch.cumsum(num_pages_per_req, dim=0)
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
             kv_indptr = kv_indptr[: bs + 1]
             kv_indices = (
-                self.kv_indices[: kv_indptr[-1]]
+                torch.empty(paged_kernel_lens_sum, dtype=torch.int32, device="cuda")
                 if not init_metadata_replay
                 else fast_decode_kwargs["kv_indices"]
             )
-
             create_flashinfer_kv_indices_triton[(bs,)](
                 self.req_to_token,
                 req_pool_indices,
@@ -712,40 +707,39 @@ class FlashInferMLAIndicesUpdaterDecode:
                 None,
                 kv_indices,
                 self.req_to_token.shape[1],
-                self.page_size,
             )
         else:
             kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
 
         if not init_metadata_replay:
             wrapper.plan(
-                qo_indptr=q_indptr,
-                kv_indptr=kv_indptr,
-                kv_indices=kv_indices,
-                kv_len_arr=kv_lens,
-                num_heads=self.num_local_heads,
-                head_dim_ckv=self.kv_lora_rank,
-                head_dim_kpe=self.qk_rope_head_dim,
-                page_size=self.page_size,
-                causal=False,
-                sm_scale=sm_scale,
-                q_data_type=self.data_type,
-                kv_data_type=self.data_type,
+                q_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_lens,
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,
+                sm_scale,
+                self.data_type,
+                self.data_type,
             )
         else:
             wrapper.plan(
-                qo_indptr_cpu=fast_decode_kwargs["qo_indptr_cpu"],
-                kv_indptr_cpu=fast_decode_kwargs["kv_indptr_cpu"],
-                kv_indices=kv_indices,
-                kv_len_arr_cpu=fast_decode_kwargs["kv_len_arr_cpu"],
-                num_heads=self.num_local_heads,
-                head_dim_ckv=self.kv_lora_rank,
-                head_dim_kpe=self.qk_rope_head_dim,
-                page_size=self.page_size,
-                causal=False,
-                sm_scale=sm_scale,
-                q_data_type=self.data_type,
-                kv_data_type=self.data_type,
+                fast_decode_kwargs["qo_indptr_cpu"],
+                fast_decode_kwargs["kv_indptr_cpu"],
+                kv_indices,
+                fast_decode_kwargs["kv_len_arr_cpu"],
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,
+                sm_scale,
+                self.data_type,
+                self.data_type,
             )
 
 
@@ -767,14 +761,12 @@ class FlashInferMLAIndicesUpdaterPrefill:
         # Buffers and wrappers
         self.kv_indptr = attn_backend.kv_indptr
         self.qo_indptr = attn_backend.qo_indptr
-        self.kv_indices = attn_backend.kv_indices
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
         self.prefill_wrapper_ragged = attn_backend.prefill_wrapper_ragged
-        self.page_size = model_runner.page_size
 
     def update(
         self,
-        req_pool_indices: torch.Tensor,
+        req_pool_indices: torch.Tnesor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
         prefix_lens: torch.Tensor,
@@ -788,6 +780,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
         else:
             paged_kernel_lens = seq_lens
             paged_kernel_lens_sum = seq_lens_sum
+
         self.call_begin_forward(
             self.prefill_wrapper_ragged,
             prefill_wrapper_paged,
@@ -821,12 +814,13 @@ class FlashInferMLAIndicesUpdaterPrefill:
 
         if spec_info is None:
             assert len(seq_lens) == len(req_pool_indices)
-            num_pages_per_req = (
-                paged_kernel_lens + self.page_size - 1
-            ) // self.page_size
-            kv_indptr[1 : bs + 1] = torch.cumsum(num_pages_per_req, dim=0)
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
             kv_indptr = kv_indptr[: bs + 1]
-            kv_indices = self.kv_indices[: kv_indptr[-1]]
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
             create_flashinfer_kv_indices_triton[(bs,)](
                 self.req_to_token,
                 req_pool_indices,
@@ -835,7 +829,6 @@ class FlashInferMLAIndicesUpdaterPrefill:
                 None,
                 kv_indices,
                 self.req_to_token.shape[1],
-                self.page_size,
             )
             qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
             qo_indptr = qo_indptr[: bs + 1]
@@ -853,6 +846,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
                     self.req_to_token,
                 )
             )
+
         if use_ragged:
             # ragged prefill
             wrapper_ragged.begin_forward(
@@ -867,26 +861,20 @@ class FlashInferMLAIndicesUpdaterPrefill:
             )
         else:
             # mla paged prefill
-            if spec_info is not None:
-                assert (
-                    self.page_size == 1
-                ), "Only page_size=1 is supported for flashinfer backend with speculative decoding"
-                kv_lens = kv_indptr[1:] - kv_indptr[:-1]
-            else:
-                kv_lens = paged_kernel_lens.to(torch.int32)
+            kv_len_arr = kv_indptr[1:] - kv_indptr[:-1]
             wrapper_paged.plan(
-                qo_indptr=qo_indptr,
-                kv_indptr=kv_indptr,
-                kv_indices=kv_indices,
-                kv_len_arr=kv_lens,
-                num_heads=self.num_local_heads,
-                head_dim_ckv=self.kv_lora_rank,
-                head_dim_kpe=self.qk_rope_head_dim,
-                page_size=self.page_size,
-                causal=True,
-                sm_scale=sm_scale,
-                q_data_type=self.q_data_type,
-                kv_data_type=self.data_type,
+                qo_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_len_arr,
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                True,
+                sm_scale,
+                self.q_data_type,
+                self.data_type,
             )
 
 
@@ -981,7 +969,6 @@ class FlashInferMLAMultiStepDraftBackend:
             call_fn(i, forward_batch)
 
     def init_forward_metadata(self, forward_batch: ForwardBatch):
-
        kv_indices = torch.zeros(
            (
                self.speculative_num_steps,
@@ -1017,7 +1004,6 @@
        )
 
     def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
-
        def call_fn(i, forward_batch):
            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
                forward_batch.batch_size,
@@ -1034,7 +1020,6 @@
     def init_forward_metadata_replay_cuda_graph(
         self, forward_batch: ForwardBatch, bs: int
     ):
-
        def call_fn(i, forward_batch):
            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
                bs,
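Taken together, the hunks above revert the FlashInfer MLA backend from page-aware KV indexing back to one KV-cache slot per token: kv_indptr becomes a plain cumulative sum of sequence lengths instead of page counts, kv_indices is allocated per call rather than sliced from a preallocated buffer, and the wrapper's plan() is passed a literal page size of 1. A minimal sketch of that token-granularity indexing, with made-up lengths (the tensor names mirror the diff; the values are illustrative only):

    import torch

    seq_lens = torch.tensor([3, 2], dtype=torch.int32)  # per-request KV lengths (toy values)
    bs = seq_lens.numel()

    kv_indptr = torch.zeros(bs + 1, dtype=torch.int32)
    kv_indptr[1:] = torch.cumsum(seq_lens, dim=0)  # [0, 3, 5]: one slot per cached token

    # One kv_indices entry per token; create_flashinfer_kv_indices_triton fills it
    # from req_to_token before wrapper.plan(...) is called with page size 1.
    kv_indices = torch.empty(int(kv_indptr[-1]), dtype=torch.int32)

With the page size fixed to 1, the number of pages per request equals its sequence length, which is why the (len + page_size - 1) // page_size rounding disappears from both the decode and prefill index updaters.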
@@ -9,89 +9,18 @@ TRITON_PAD_NUM_PAGE_PER_BLOCK = 64
 
 @triton.jit
 def create_flashinfer_kv_indices_triton(
-    req_to_token_ptr,
+    req_to_token_ptr,  # [max_batch, max_context_len]
     req_pool_indices_ptr,
     page_kernel_lens_ptr,
     kv_indptr,
     kv_start_idx,
     kv_indices_ptr,
     req_to_token_ptr_stride: tl.constexpr,
-    PAGE_SIZE: tl.constexpr = 1,
 ):
-    """
-    Create KV indices for FlashInfer attention backend.
-
-    This Triton kernel builds a lookup table that maps from logical request/token
-    coordinates to physical token locations in the global KV cache pool. It's used
-    by FlashInfer attention backends to efficiently access scattered KV cache data.
-
-    The kernel processes each request in parallel and converts the req_to_token
-    lookup table into a flat list of token indices that can be used by attention kernels.
-
-    general idea:
-    blocktables/kv_indices_ptr = [batch_size * max_pages(for graph mode with
-    fixed number of pages)]
-    max_pages = max_context_len / PAGED_SIZE
-    kv_indices_ptr will store the flat list of the pages used by each request
-    Args:
-    Inputs Arguments (non mutable):
-
-    req_to_token_ptr: Request to token location look up table
-        Shape: [max_batch, max_context_len]
-    req_pool_indices_ptr: Request to pool index look up table. Each request uses
-        one pool.
-        Shape: [batch_size]
-    page_kernel_lens_ptr: sequence lengths per request
-        Shape: [batch_size]
-    kv_indptr: Should be computed based on number of pages used by each request.
-        It is used by flashinfer attention kernels to index into the kv_indices_ptr
-        per request.
-        Shape: [batch_size + 1]
-        kv_indptr[i] = start index in kv_indices for request i
-    kv_start_idx: Pointer to array containing start offsets for each request in SGL.
-        Can be None. If provided, adds offset to token positions.
-
-    req_to_token_ptr_stride: Stride for the second dimension of req_to_token.
-        Equal to max_context_len.
-
-    PAGED_SIZE: Number of tokens per page. Default is 1 for FlashInfer.
-
-    Outputs:
-    kv_indices_ptr: Pointer to output array where KV indices will be stored.
-        Shape: [total_num_pages],
-        where total_num_pages = sum(seq_lens // PAGED_SIZE)
-
-    Example:
-    If we have:
-    - req_pool_indices = [0, 1] (request 0 uses pool 0, request 1 uses pool 1)
-    - page_kernel_lens = [3, 2] (request 0 has 3 tokens, request 1 has 2 tokens)
-    - req_to_token = [[10, 11, 12, -1], [20, 21, -1, -1]] (tokens are the elements
-      in radix tree, use them as a pointer to the token location in the kv_indices_ptr)
-
-    The kernel will output:
-    If PAGE_SIZE = 1:
-        packed
-        - kv_indptr (passed in as input arg): [0, 3, 5]
-        - kv_indices = [10, 11, 12, 20, 21]
-        padded - max_pages is 10 tokens per req
-        - kv_indptr (passed in as input arg): [0, 10, 20]
-        - kv_indices = [10, 11, 12, -1, -1, -1, -1, -1, -1, -1,
-                        20, 21, -1, -1, -1, -1, -1, -1, -1, -1]
-
-    If PAGE_SIZE = 2
-        packed:
-        - kv_indptr (passed in as input arg): [0, 3, 4]
-        - kv_indices = [5, 6, 10]
-        padded: max_pages is 4
-        - kv_indptr (passed in as input arg): [0, 4, 8, ...] (note that 4 is the max_pages)
-        - kv_indices = [5, 6, -1, -1,
-                        10, -1, -1, -1]
-    This allows attention kernels to directly access the correct KV cache
-    entries for each request's tokens.
-    """
     BLOCK_SIZE: tl.constexpr = 512
-    NUM_PAGES_PER_BLOCK: tl.constexpr = BLOCK_SIZE // PAGE_SIZE
     pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
     req_pool_index = tl.load(req_pool_indices_ptr + pid)
     kv_indices_offset = tl.load(kv_indptr + pid)
 
@@ -102,27 +31,19 @@ def create_flashinfer_kv_indices_triton(
     kv_end = kv_start
     kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
 
-    kv_range = kv_end - kv_start
-    num_pages = tl.cdiv(kv_range, PAGE_SIZE)
-    num_loops = tl.cdiv(kv_range, BLOCK_SIZE)
-    req_to_token_block_start = (
-        req_to_token_ptr + req_pool_index * req_to_token_ptr_stride + kv_start
-    )
-    for i in range(num_loops):
-        token_offsets_in_block = (
-            tl.arange(0, NUM_PAGES_PER_BLOCK).to(tl.int64) + i * NUM_PAGES_PER_BLOCK
-        ) * PAGE_SIZE
-        page_offsets_in_block = token_offsets_in_block // PAGE_SIZE
-        valid_tokens = token_offsets_in_block < kv_range
-        valid_pages = page_offsets_in_block < num_pages
-        token_numbers = tl.load(
-            req_to_token_block_start + token_offsets_in_block, mask=valid_tokens
-        )
-        tl.store(
-            kv_indices_ptr + kv_indices_offset + page_offsets_in_block,
-            token_numbers // PAGE_SIZE,  # write the page numbers to kv_indices_ptr
-            mask=valid_pages,
+    num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
+    for i in range(num_loop):
+        # index into req_to_token_ptr needs to be int64
+        offset = tl.arange(0, BLOCK_SIZE).to(tl.int64) + i * BLOCK_SIZE
+        mask = offset < kv_end - kv_start
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + kv_start
+            + offset,
+            mask=mask,
         )
+        tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
 
 
 @triton.jit
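In its reverted form, the kernel above simply walks each request's row of req_to_token and copies the first seq_len token slots into the flat kv_indices array at the offset given by kv_indptr (kv_start_idx, when provided, shifts the starting column). A pure-PyTorch reference of that mapping, reusing the example from the removed docstring; this is a readability sketch only, not the Triton kernel itself:

    import torch

    # Toy values taken from the removed docstring's example.
    req_to_token = torch.tensor([[10, 11, 12, -1], [20, 21, -1, -1]], dtype=torch.int32)
    req_pool_indices = torch.tensor([0, 1])
    page_kernel_lens = torch.tensor([3, 2])
    kv_indptr = torch.tensor([0, 3, 5])

    kv_indices = torch.empty(int(kv_indptr[-1]), dtype=torch.int32)
    for pid in range(req_pool_indices.numel()):
        pool = int(req_pool_indices[pid])      # which req_to_token row this request uses
        length = int(page_kernel_lens[pid])    # how many KV entries it has
        start = int(kv_indptr[pid])            # where its slice begins in kv_indices
        kv_indices[start : start + length] = req_to_token[pool, :length]

    # kv_indices -> tensor([10, 11, 12, 20, 21], dtype=torch.int32)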
@@ -40,10 +40,9 @@ from sglang.srt.layers.moe import (
     get_moe_a2a_backend,
     should_use_flashinfer_cutlass_moe_fp4_allgather,
 )
-from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import is_cuda, is_flashinfer_available
+from sglang.srt.utils import is_cuda, is_flashinfer_available, is_sm100_supported
 
 _is_flashinfer_available = is_flashinfer_available()
 _is_sm100_supported = is_cuda() and is_sm100_supported()
@@ -1,20 +1,12 @@
 """CUTLASS based Fused MoE kernels."""
 
-import functools
-import json
-import logging
-import os
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
 import torch
 
 from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.utils import is_cuda
 
 _is_cuda = is_cuda()
 if _is_cuda:
-    import sgl_kernel
     from sgl_kernel import (
         apply_shuffle_mul_sum,
         cutlass_fp4_group_mm,
@@ -157,10 +149,6 @@ def cutlass_fused_experts_fp8(
     rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
     rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
 
-    if not is_sm100_supported():
-        rep_a1_scales = per_group_transpose(rep_a1_scales, expert_offsets)
-        w1_scale = w1_scale.contiguous()
-
     c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
     c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
 
@@ -192,9 +180,6 @@
     silu_and_mul(c1, intermediate)
 
     intemediate_q, a2_scale = sglang_per_token_group_quant_fp8(intermediate, 128)
-    if not is_sm100_supported():
-        a2_scale = per_group_transpose(a2_scale, expert_offsets)
-        w2_scale = w2_scale.contiguous()
 
     fp8_blockwise_scaled_grouped_mm(
         c2,
@@ -248,7 +248,6 @@ class EPMoE(FusedMoE):
             gateup_output,
             masked_m,
             expected_m,
-            recipe=(1, 128, 128) if deep_gemm_wrapper.DEEPGEMM_BLACKWELL else None,
         )
         del gateup_input
         del gateup_input_fp8
@@ -304,7 +303,6 @@
             down_output,
             masked_m,
             expected_m,
-            recipe=(1, 128, 128) if deep_gemm_wrapper.DEEPGEMM_BLACKWELL else None,
         )
         del down_input
         del down_input_fp8
@@ -667,7 +665,6 @@ class DeepEPMoE(EPMoE):
             gateup_output,
             masked_m,
             expected_m,
-            recipe=(1, 128, 128) if deep_gemm_wrapper.DEEPGEMM_BLACKWELL else None,
         )
         dispose_tensor(hidden_states_fp8[0])
 
@@ -708,9 +705,7 @@
             (
                 down_input_scale
                 if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.get_col_major_tma_aligned_tensor(
-                    down_input_scale
-                )
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
             ),
         )
         down_output = torch.empty(
@@ -722,7 +717,6 @@
             down_output,
             masked_m,
             expected_m,
-            recipe=(1, 128, 128) if deep_gemm_wrapper.DEEPGEMM_BLACKWELL else None,
         )
 
         return down_output