sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/configs/deepseekvl2.py +11 -2
  4. sglang/srt/configs/internvl.py +3 -0
  5. sglang/srt/configs/janus_pro.py +3 -0
  6. sglang/srt/configs/model_config.py +9 -7
  7. sglang/srt/configs/update_config.py +3 -1
  8. sglang/srt/conversation.py +1 -0
  9. sglang/srt/custom_op.py +5 -2
  10. sglang/srt/disaggregation/decode.py +9 -1
  11. sglang/srt/disaggregation/mooncake/conn.py +44 -56
  12. sglang/srt/distributed/parallel_state.py +33 -0
  13. sglang/srt/entrypoints/engine.py +30 -26
  14. sglang/srt/entrypoints/openai/serving_chat.py +21 -2
  15. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/qwen3_detector.py +150 -0
  18. sglang/srt/hf_transformers_utils.py +0 -1
  19. sglang/srt/layers/activation.py +13 -0
  20. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  21. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  22. sglang/srt/layers/linear.py +13 -102
  23. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  24. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  25. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  26. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  34. sglang/srt/layers/moe/topk.py +187 -12
  35. sglang/srt/layers/quantization/__init__.py +20 -134
  36. sglang/srt/layers/quantization/awq.py +578 -11
  37. sglang/srt/layers/quantization/awq_triton.py +339 -0
  38. sglang/srt/layers/quantization/base_config.py +85 -10
  39. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
  42. sglang/srt/layers/quantization/fp8.py +273 -62
  43. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  44. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  45. sglang/srt/layers/quantization/gptq.py +501 -143
  46. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +26 -108
  48. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  49. sglang/srt/layers/quantization/petit.py +252 -0
  50. sglang/srt/layers/quantization/petit_utils.py +104 -0
  51. sglang/srt/layers/quantization/qoq.py +7 -6
  52. sglang/srt/layers/quantization/scalar_type.py +352 -0
  53. sglang/srt/layers/quantization/unquant.py +422 -0
  54. sglang/srt/layers/quantization/utils.py +343 -3
  55. sglang/srt/layers/quantization/w4afp8.py +8 -4
  56. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  57. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  58. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  59. sglang/srt/lora/lora.py +0 -4
  60. sglang/srt/lora/lora_manager.py +87 -53
  61. sglang/srt/lora/mem_pool.py +81 -33
  62. sglang/srt/lora/utils.py +12 -5
  63. sglang/srt/managers/cache_controller.py +241 -0
  64. sglang/srt/managers/io_struct.py +41 -29
  65. sglang/srt/managers/mm_utils.py +7 -8
  66. sglang/srt/managers/schedule_batch.py +150 -110
  67. sglang/srt/managers/schedule_policy.py +68 -27
  68. sglang/srt/managers/scheduler.py +243 -61
  69. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  70. sglang/srt/managers/tokenizer_manager.py +11 -3
  71. sglang/srt/managers/tp_worker.py +14 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  73. sglang/srt/mem_cache/allocator.py +7 -16
  74. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  75. sglang/srt/mem_cache/chunk_cache.py +5 -2
  76. sglang/srt/mem_cache/hicache_storage.py +152 -0
  77. sglang/srt/mem_cache/hiradix_cache.py +179 -4
  78. sglang/srt/mem_cache/memory_pool.py +16 -1
  79. sglang/srt/mem_cache/memory_pool_host.py +41 -2
  80. sglang/srt/mem_cache/radix_cache.py +26 -0
  81. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  82. sglang/srt/metrics/collector.py +9 -0
  83. sglang/srt/model_executor/cuda_graph_runner.py +5 -6
  84. sglang/srt/model_executor/forward_batch_info.py +14 -1
  85. sglang/srt/model_executor/model_runner.py +109 -22
  86. sglang/srt/model_loader/loader.py +7 -1
  87. sglang/srt/model_loader/utils.py +4 -4
  88. sglang/srt/models/clip.py +1 -1
  89. sglang/srt/models/deepseek.py +9 -6
  90. sglang/srt/models/deepseek_janus_pro.py +1 -1
  91. sglang/srt/models/deepseek_v2.py +191 -171
  92. sglang/srt/models/deepseek_vl2.py +5 -5
  93. sglang/srt/models/gemma.py +48 -0
  94. sglang/srt/models/gemma2.py +52 -0
  95. sglang/srt/models/gemma3_causal.py +63 -0
  96. sglang/srt/models/gemma3_mm.py +1 -1
  97. sglang/srt/models/gemma3n_mm.py +2 -4
  98. sglang/srt/models/granitemoe.py +385 -0
  99. sglang/srt/models/grok.py +9 -3
  100. sglang/srt/models/hunyuan.py +63 -16
  101. sglang/srt/models/internvl.py +1 -1
  102. sglang/srt/models/kimi_vl.py +1 -1
  103. sglang/srt/models/llama.py +41 -0
  104. sglang/srt/models/llama4.py +11 -11
  105. sglang/srt/models/llava.py +2 -2
  106. sglang/srt/models/llavavid.py +1 -1
  107. sglang/srt/models/minicpm.py +0 -2
  108. sglang/srt/models/minicpmo.py +3 -7
  109. sglang/srt/models/minicpmv.py +1 -1
  110. sglang/srt/models/mistral.py +1 -1
  111. sglang/srt/models/mixtral.py +9 -2
  112. sglang/srt/models/mllama.py +3 -5
  113. sglang/srt/models/mllama4.py +3 -3
  114. sglang/srt/models/olmoe.py +8 -5
  115. sglang/srt/models/persimmon.py +330 -0
  116. sglang/srt/models/phi.py +321 -0
  117. sglang/srt/models/phi4mm.py +44 -4
  118. sglang/srt/models/phi4mm_audio.py +1260 -0
  119. sglang/srt/models/phi4mm_utils.py +1917 -0
  120. sglang/srt/models/phimoe.py +9 -3
  121. sglang/srt/models/qwen.py +37 -0
  122. sglang/srt/models/qwen2.py +41 -0
  123. sglang/srt/models/qwen2_5_vl.py +4 -4
  124. sglang/srt/models/qwen2_audio.py +1 -1
  125. sglang/srt/models/qwen2_moe.py +53 -5
  126. sglang/srt/models/qwen2_vl.py +4 -4
  127. sglang/srt/models/qwen3.py +65 -1
  128. sglang/srt/models/qwen3_moe.py +56 -18
  129. sglang/srt/models/vila.py +1 -1
  130. sglang/srt/multimodal/processors/base_processor.py +91 -97
  131. sglang/srt/multimodal/processors/clip.py +21 -19
  132. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  133. sglang/srt/multimodal/processors/gemma3.py +13 -17
  134. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  135. sglang/srt/multimodal/processors/internvl.py +9 -10
  136. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  137. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  138. sglang/srt/multimodal/processors/llava.py +4 -2
  139. sglang/srt/multimodal/processors/minicpm.py +35 -44
  140. sglang/srt/multimodal/processors/mlama.py +21 -18
  141. sglang/srt/multimodal/processors/mllama4.py +4 -5
  142. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  143. sglang/srt/multimodal/processors/pixtral.py +14 -35
  144. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  145. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  146. sglang/srt/multimodal/processors/vila.py +14 -14
  147. sglang/srt/sampling/sampling_params.py +8 -1
  148. sglang/srt/server_args.py +393 -230
  149. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
  150. sglang/srt/two_batch_overlap.py +1 -0
  151. sglang/srt/utils.py +27 -1
  152. sglang/test/runners.py +14 -3
  153. sglang/test/test_block_fp8.py +8 -3
  154. sglang/test/test_block_fp8_ep.py +1 -1
  155. sglang/test/test_custom_ops.py +12 -7
  156. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  157. sglang/test/test_fp4_moe.py +1 -3
  158. sglang/test/test_marlin_moe.py +286 -0
  159. sglang/test/test_marlin_utils.py +171 -0
  160. sglang/test/test_utils.py +35 -0
  161. sglang/version.py +1 -1
  162. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
  163. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
  164. sglang/srt/layers/quantization/quant_utils.py +0 -166
  165. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  166. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
  167. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
  168. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/function_call/qwen3_detector.py (new file)
@@ -0,0 +1,150 @@
+import ast
+import html
+import json
+import logging
+import re
+from typing import Any, Dict, List, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def _safe_val(raw: str) -> Any:
+    raw = html.unescape(raw.strip())
+    try:
+        return json.loads(raw)
+    except Exception:
+        try:
+            return ast.literal_eval(raw)
+        except Exception:
+            return raw
+
+
+class Qwen3XMLDetector(BaseFormatDetector):
+    """
+    Detector for Qwen 3 models.
+    Assumes function call format:
+    <tool_call>
+    <function=execute_bash>
+    <parameter=command>
+    pwd && ls
+    </parameter>
+    </function>
+    </tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.tool_call_prefix: str = "<function="
+        self.tool_call_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
+        )
+        self.tool_call_function_regex = re.compile(
+            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
+        )
+        self.tool_call_parameter_regex = re.compile(
+            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL
+        )
+        self._buf: str = ""
+
+    def has_tool_call(self, text: str) -> bool:
+        return self.tool_call_start_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        normal, calls = self._extract(text, tools)
+        return StreamingParseResult(normal_text=normal, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        self._buf += new_text
+        normal = ""
+        calls: List[ToolCallItem] = []
+        while True:
+            if self.tool_call_start_token not in self._buf:
+                normal += self._buf
+                self._buf = ""
+                break
+            s = self._buf.find(self.tool_call_start_token)
+            if s > 0:
+                normal += self._buf[:s]
+                self._buf = self._buf[s:]
+            e = self._buf.find(self.tool_call_end_token)
+            if e == -1:
+                break
+            block = self._buf[: e + len(self.tool_call_end_token)]
+            self._buf = self._buf[e + len(self.tool_call_end_token) :]
+            calls.extend(self._parse_block(block, tools))
+        return StreamingParseResult(normal_text=normal, calls=calls)
+
+    def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
+        normal_parts: List[str] = []
+        calls: List[ToolCallItem] = []
+        cursor = 0
+        while True:
+            s = text.find(self.tool_call_start_token, cursor)
+            if s == -1:
+                normal_parts.append(text[cursor:])
+                break
+            normal_parts.append(text[cursor:s])
+            e = text.find(self.tool_call_end_token, s)
+            if e == -1:
+                normal_parts.append(text[s:])
+                break
+            block = text[s : e + len(self.tool_call_end_token)]
+            cursor = e + len(self.tool_call_end_token)
+            calls.extend(self._parse_block(block, tools))
+        return "".join(normal_parts), calls
+
+    def _parse_block(self, block: str, tools: List[Tool]) -> List[ToolCallItem]:
+        res: List[ToolCallItem] = []
+        for m in self.tool_call_function_regex.findall(block):
+            txt = m[0] if m[0] else m[1]
+            if ">" not in txt:
+                continue
+            idx = txt.index(">")
+            fname = txt[:idx].strip()
+            body = txt[idx + 1 :]
+            params: Dict[str, Any] = {}
+            for pm in self.tool_call_parameter_regex.findall(body):
+                ptxt = pm[0] if pm[0] else pm[1]
+                if ">" not in ptxt:
+                    continue
+                pidx = ptxt.index(">")
+                pname = ptxt[:pidx].strip()
+                pval = ptxt[pidx + 1 :].lstrip("\n").rstrip("\n")
+                params[pname] = _safe_val(pval)
+            raw = {"name": fname, "arguments": params}
+            try:
+                res.extend(self.parse_base_json(raw, tools))
+            except Exception:
+                logger.warning("invalid tool call for %s dropped", fname)
+        return res
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda n: StructureInfo(
+            begin=f"{self.tool_call_start_token}\n<function={n}>",
+            end=f"</function>\n{self.tool_call_end_token}",
+            trigger=self.tool_call_start_token,
+        )
+
+    # TODO: fake ebnf for xml + outlines backend
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.tool_call_start_token.replace("\n", "\\n"),
+            individual_call_end_token=self.tool_call_end_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
+            function_format="json",
+        )
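Aside: the value coercion in _safe_val above falls back from JSON to Python literals to the raw string, which is what lets a shell command like pwd && ls survive as plain text. A standalone sketch of that chain (same logic as the new module, trimmed to the helper; not an sglang test):

import ast
import html
import json

def safe_val(raw: str):
    # Mirrors _safe_val: try JSON, then a Python literal, else keep the raw text.
    raw = html.unescape(raw.strip())
    try:
        return json.loads(raw)
    except Exception:
        try:
            return ast.literal_eval(raw)
        except Exception:
            return raw

print(safe_val('{"depth": 2}'))  # {'depth': 2} via json.loads
print(safe_val("('a', 'b')"))    # ('a', 'b') via ast.literal_eval
print(safe_val("pwd && ls"))     # 'pwd && ls' falls through as a plain string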
sglang/srt/hf_transformers_utils.py
@@ -167,7 +167,6 @@ def get_generation_config(
             model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
         )
     except OSError as e:
-        logging.info("model doesn't have generation_config.json")
        return None
 
 
sglang/srt/layers/activation.py
@@ -110,6 +110,17 @@ class NewGELU(CustomOp):
         return self.forward_native(x)
 
 
+class ReLU2(nn.Module):
+    """
+    Applies the squared Rectified Linear Unit function.
+    y = max(0, x)^2
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.relu(x)
+        return x * x
+
+
 class QuickGELU(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         return x * torch.sigmoid(1.702 * x)
@@ -164,6 +175,8 @@ class ScaledActivation(nn.Module):
 _ACTIVATION_REGISTRY = {
     "gelu": nn.GELU(),
     "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
+    "gelu_new": NewGELU(),
+    "relu2": ReLU2(),
 }
 
 
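A quick sanity check for the new "relu2" registry entry; this assumes only that torch is installed and mirrors ReLU2.forward (relu, then square) as a reference expression:

import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, -0.5, 0.0, 1.5])
y = F.relu(x) ** 2  # reference for ReLU2: max(0, x)^2
print(y)  # tensor([0.0000, 0.0000, 0.0000, 2.2500])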
sglang/srt/layers/attention/flashattention_backend.py
@@ -1617,7 +1617,7 @@ class FlashAttentionBackend(AttentionBackend):
                 metadata.max_seq_len_k + self.page_size - 1
             ) // self.page_size
 
-            normal_decode_set_medadata(
+            normal_decode_set_metadata(
                 metadata.cache_seqlens_int32,
                 metadata.cu_seqlens_k,
                 metadata.page_table,
@@ -1666,7 +1666,7 @@ class FlashAttentionBackend(AttentionBackend):
             max_seq_pages = (max_len + self.page_size - 1) // self.page_size
             metadata.max_seq_len_k = max_len
 
-            normal_decode_set_medadata(
+            normal_decode_set_metadata(
                 metadata.cache_seqlens_int32,
                 metadata.cu_seqlens_k,
                 metadata.page_table,
@@ -2089,7 +2089,7 @@ class FlashAttentionMultiStepBackend:
 # @torch.compile(dynamic=True, backend=get_compiler_backend())
 # TODO: fuse these kernels
 # NOTE: torch.compile makes it slower in speculative decoding
-def normal_decode_set_medadata(
+def normal_decode_set_metadata(
     cache_seqlens_int32: torch.Tensor,
     cu_seqlens_k: torch.Tensor,
     page_table: torch.Tensor,
sglang/srt/layers/attention/flashinfer_backend.py
@@ -25,7 +25,9 @@ from sglang.global_config import global_config
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
 from sglang.srt.layers.utils import is_sm100_supported
+from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 from sglang.srt.utils import is_flashinfer_available, next_power_of_2
@@ -485,12 +487,20 @@ class FlashInferAttnBackend(AttentionBackend):
                     v_scale=layer.v_scale,
                 )
         else:
+            causal = True
+            if layer.attn_type == AttentionType.ENCODER_ONLY:
+                save_kv_cache = False
+                causal = False
+
             if self.forward_metadata.extend_no_prefix:
+                # NOTE: FlashInfer currently has limitations with head_dim = 32 or other dimensions
+                # The FlashInfer head_dim limitation itself is tracked here:
+                # https://github.com/flashinfer-ai/flashinfer/issues/1048
                 o = self.prefill_wrapper_ragged.forward(
                     q.view(-1, layer.tp_q_head_num, layer.head_dim),
                     k.view(-1, layer.tp_k_head_num, layer.head_dim),
                     v.view(-1, layer.tp_v_head_num, layer.head_dim),
-                    causal=True,
+                    causal=causal,
                     sm_scale=layer.scaling,
                     logits_soft_cap=logits_soft_cap,
                 )
@@ -589,6 +599,7 @@ class FlashInferIndicesUpdaterDecode:
         self.kv_indptr = attn_backend.kv_indptr
         self.kv_last_page_len = attn_backend.kv_last_page_len
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
 
         # Dispatch the update function
         if self.attn_backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
@@ -655,6 +666,10 @@
                 paged_kernel_lens_sum_tmp = seq_lens_sum
                 kv_start_idx_tmp = None
 
+            use_sliding_window_kv_pool = wrapper_id == 0 and isinstance(
+                self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+            )
+
            self.call_begin_forward(
                 decode_wrappers[wrapper_id],
                 req_pool_indices,
@@ -663,6 +678,7 @@
                 self.kv_indptr[wrapper_id],
                 kv_start_idx_tmp,
                 spec_info,
+                use_sliding_window_kv_pool=use_sliding_window_kv_pool,
             )
 
     def update_cross_attention(
@@ -704,6 +720,7 @@
         kv_indptr: torch.Tensor,
         kv_start_idx: torch.Tensor,
         spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        use_sliding_window_kv_pool: bool = False,
     ):
         if spec_info is None:
             bs = len(req_pool_indices)
@@ -731,6 +748,14 @@
             kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
             bs = kv_indptr.shape[0] - 1
 
+        if use_sliding_window_kv_pool:
+            kv_last_index = kv_indptr[-1]
+            kv_indices[:kv_last_index] = (
+                self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    kv_indices[:kv_last_index]
+                )
+            )
+
         wrapper.begin_forward(
             kv_indptr,
             kv_indices,
@@ -765,6 +790,7 @@ class FlashInferIndicesUpdaterPrefill:
         self.kv_last_page_len = attn_backend.kv_last_page_len
         self.qo_indptr = attn_backend.qo_indptr
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
         self.prefill_wrapper_ragged = attn_backend.prefill_wrapper_ragged
 
         # Dispatch the update function
@@ -848,6 +874,9 @@
                 paged_kernel_lens_sum = seq_lens_sum
 
            kv_start_idx = seq_lens - paged_kernel_lens
+            use_sliding_window_kv_pool = wrapper_id == 0 and isinstance(
+                self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+            )
 
            self.call_begin_forward(
                 self.prefill_wrapper_ragged,
@@ -862,6 +891,7 @@
                 self.qo_indptr[wrapper_id],
                 use_ragged,
                 spec_info,
+                use_sliding_window_kv_pool=use_sliding_window_kv_pool,
             )
 
     def update_cross_attention(
@@ -916,6 +946,7 @@
         qo_indptr: torch.Tensor,
         use_ragged: bool,
         spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        use_sliding_window_kv_pool: bool = False,
     ):
         bs = len(seq_lens)
         if spec_info is None:
@@ -964,6 +995,14 @@
             q_data_type=self.q_data_type,
         )
 
+        if use_sliding_window_kv_pool:
+            kv_last_index = kv_indptr[-1]
+            kv_indices[:kv_last_index] = (
+                self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    kv_indices[:kv_last_index]
+                )
+            )
+
         # cached part
         wrapper_paged.begin_forward(
             qo_indptr,
sglang/srt/layers/linear.py
@@ -1,12 +1,12 @@
 """Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/linear.py"""
 
+from __future__ import annotations
+
 import itertools
 import logging
-from abc import abstractmethod
-from typing import Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import torch
-import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter
 
 from sglang.srt.distributed import (
@@ -17,7 +17,6 @@ from sglang.srt.distributed import (
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce,
 )
-from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.parameter import (
     BasevLLMParameter,
     BlockQuantScaleParameter,
@@ -27,17 +26,14 @@ from sglang.srt.layers.parameter import (
     RowvLLMParameter,
     _ColumnvLLMParameter,
 )
-from sglang.srt.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
-from sglang.srt.utils import (
-    cpu_has_amx_support,
-    is_cpu,
-    is_npu,
-    set_weight_attrs,
-    use_intel_amx_backend,
-)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import is_cpu, is_npu, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import (
+        QuantizationConfig,
+        QuantizeMethodBase,
+    )
 
 logger = logging.getLogger(__name__)
 
@@ -57,9 +53,9 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "ModelOptFp8LinearMethod",
     "ModelOptFp4LinearMethod",
     "IPEXAWQLinearMethod",
+    "PetitNvFp4LinearMethod",
 ]
 
-_is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
 _is_npu = is_npu()
 
@@ -110,91 +106,6 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
     return param[shard_id], loaded_weight
 
 
-class LinearMethodBase(QuantizeMethodBase):
-    """Base class for different (maybe quantized) linear methods."""
-
-    @abstractmethod
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        """Create weights for a linear layer.
-        The weights will be set as attributes of the layer.
-
-        Args:
-            layer: The layer that is using the LinearMethodBase factory.
-            input_size_per_partition: Size of the weight input dim on rank X.
-            output_partition_sizes: Sizes of the output dim of each logical
-                weight on rank X. E.g., output_partition_sizes for QKVLinear
-                is a list contains the width of Wq, Wk, Wv on rank X.
-            input_size: Size of the input dim of the weight across all ranks.
-            output_size: Size of the output dim of the weight across all ranks.
-            params_dtype: Datatype of the parameters.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Apply the weights in layer to the input tensor.
-        Expects create_weights to have been called before on the layer."""
-        raise NotImplementedError
-
-
-class UnquantizedLinearMethod(LinearMethodBase):
-    """Linear method without quantization."""
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        weight = Parameter(
-            torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-        layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, extra_weight_attrs)
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if _is_cpu and _is_cpu_amx_available:
-            _amx_process_weight_after_loading(layer, ["weight"])
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-
-        if use_intel_amx_backend(layer):
-            return torch.ops.sgl_kernel.weight_packed_linear(
-                x, layer.weight, bias, True  # is_vnni
-            )
-
-        return F.linear(x, layer.weight, bias)
-
-
 class LinearBase(torch.nn.Module):
     """Base linear layer.
 
@@ -310,7 +221,7 @@ class ReplicatedLinear(LinearBase):
         assert param.size() == loaded_weight.size()
         param.data.copy_(loaded_weight)
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
         output = self.quant_method.apply(self, x, bias)
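Note that LinearMethodBase and UnquantizedLinearMethod are removed here but not from the package: the import hunk above pulls UnquantizedLinearMethod from the new sglang/srt/layers/quantization/unquant.py (file 53 in the list, +422 lines), and base_config is now imported under TYPE_CHECKING only, presumably to break a runtime import cycle. Downstream code only needs the new path:

# Old location, removed in this release:
#   from sglang.srt.layers.linear import UnquantizedLinearMethod
# New location, as imported by linear.py itself above:
from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod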
sglang/srt/layers/moe/ep_moe/kernels.py
@@ -236,7 +236,8 @@ def pre_reorder_triton_kernel(
 ):
     OutDtype = gateup_input_ptr.dtype.element_ty
 
-    src_idx = tl.program_id(0)
+    src_idx_int32 = tl.program_id(0)
+    src_idx = src_idx_int32.to(tl.int64)
     src2dst_ptr = src2dst_ptr + src_idx * topk
     topk_ids_ptr = topk_ids_ptr + src_idx * topk
     src_ptr = input_ptr + src_idx * hidden_size
@@ -255,7 +256,8 @@ def pre_reorder_triton_kernel(
             else:
                 scale = 1.0
 
-            dst_idx = tl.load(src2dst_ptr + idx)
+            dst_idx_int32 = tl.load(src2dst_ptr + idx)
+            dst_idx = dst_idx_int32.to(tl.int64)
             dst_ptr = gateup_input_ptr + dst_idx * hidden_size
             for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
                 offset = start_offset + vec
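The int64 casts above guard the pointer arithmetic that follows them: tl.program_id(0) and the loaded src2dst entries are int32, so an offset like src_idx * hidden_size wraps once it passes 2**31 - 1 on large batches. A rough bound, with 7168 chosen purely for illustration:

# How many rows fit before an int32 element offset overflows.
hidden_size = 7168               # illustrative hidden dimension
max_int32 = 2**31 - 1
print(max_int32 // hidden_size)  # 299593 rows; beyond this, int32 offsets wrap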