sglang 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_one_batch.py +2 -4
- sglang/bench_serving.py +75 -26
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +13 -15
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +38 -57
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +13 -13
- sglang/srt/layers/attention/flashinfer_backend.py +13 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +13 -14
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +13 -15
- sglang/srt/layers/logits_processor.py +13 -15
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +25 -19
- sglang/srt/managers/detokenizer_manager.py +13 -16
- sglang/srt/managers/io_struct.py +43 -28
- sglang/srt/managers/schedule_batch.py +55 -26
- sglang/srt/managers/schedule_policy.py +13 -15
- sglang/srt/managers/scheduler.py +89 -70
- sglang/srt/managers/session_controller.py +14 -15
- sglang/srt/managers/tokenizer_manager.py +29 -22
- sglang/srt/managers/tp_worker.py +13 -15
- sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- sglang/srt/model_executor/forward_batch_info.py +19 -17
- sglang/srt/model_executor/model_runner.py +42 -30
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +15 -15
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +24 -19
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +13 -15
- sglang/srt/models/llavavid.py +13 -15
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +21 -19
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +15 -17
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +13 -15
- sglang/srt/openai_api/protocol.py +13 -15
- sglang/srt/sampling/sampling_batch_info.py +4 -1
- sglang/srt/sampling/sampling_params.py +13 -15
- sglang/srt/server.py +59 -34
- sglang/srt/server_args.py +22 -22
- sglang/srt/utils.py +196 -17
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +13 -14
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +24 -15
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6.dist-info/RECORD +0 -161
- /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/hf_transformers_utils.py
CHANGED
```diff
@@ -1,18 +1,16 @@
-…
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Utilities for Huggingface Transformers."""
 
 import contextlib
```
sglang/srt/layers/activation.py
CHANGED
```diff
@@ -1,16 +1,16 @@
-…
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Fused operators for activation layers."""
 
 import logging
```
sglang/srt/layers/attention/flashinfer_backend.py
CHANGED
```diff
@@ -7,6 +7,7 @@ FlashInfer is faster and Triton is easier to customize.
 Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode.
 """
 
+import os
 from enum import Enum, auto
 from typing import TYPE_CHECKING, List
 
@@ -45,13 +46,19 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
 
         # Parse constants
-        if …
-            self.decode_use_tensor_cores = True
+        if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
+            self.decode_use_tensor_cores = (
+                os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"].lower() == "true"
+            )
         else:
-            …
+            if not _grouped_size_compiled_for_decode_kernels(
+                model_runner.model_config.num_attention_heads // model_runner.tp_size,
+                model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
+            ):
+                self.decode_use_tensor_cores = True
+            else:
+                self.decode_use_tensor_cores = False
 
         self.max_context_len = model_runner.model_config.context_len
 
         assert not (
```
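The new selection order: an explicit `SGLANG_FLASHINFER_USE_TENSOR_CORE` environment variable wins; otherwise the FlashInfer heuristic decides. A minimal plain-Python sketch of that logic, with `_grouped_size_compiled_for_decode_kernels` stubbed here — the real helper ships with FlashInfer, and the group sizes in the stub are an assumption, not FlashInfer's actual table:

```python
import os

def _grouped_size_compiled_for_decode_kernels(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Stub standing in for FlashInfer's helper: True when a specialized
    # decode kernel was compiled for this GQA group size (values assumed).
    return num_qo_heads // num_kv_heads in (1, 2, 4, 8)

def decode_use_tensor_cores(num_qo_heads: int, num_kv_heads: int) -> bool:
    # The environment variable, when set, overrides the heuristic entirely.
    if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
        return os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"].lower() == "true"
    # Otherwise: use tensor cores only when no specialized grouped kernel exists.
    return not _grouped_size_compiled_for_decode_kernels(num_qo_heads, num_kv_heads)
```

With the variable unset, behavior matches the old heuristic; setting it to `true` or `false` forces the choice either way.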
sglang/srt/layers/attention/triton_ops/decode_attention.py
CHANGED
```diff
@@ -1,18 +1,16 @@
-…
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Memory-efficient attention for decoding.
 It supports page size = 1.
@@ -26,6 +24,8 @@ import triton.language as tl
 
 from sglang.srt.utils import is_hip
 
+is_hip_ = is_hip()
+
 
 @triton.jit
 def tanh(x):
@@ -52,12 +52,13 @@ def _fwd_kernel_stage1(
     kv_group_num: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
+    SPLIT_K: tl.constexpr,
     logit_cap: tl.constexpr,
     Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
-    …
+    split_k_id = tl.program_id(2)
 
     reduce_dtype = Att_Out.dtype.element_ty
     cur_kv_head = cur_head // kv_group_num
@@ -67,22 +68,18 @@ def _fwd_kernel_stage1(
     cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
     cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
 
-    cur_batch_start_index = 0
-    cur_batch_end_index = cur_batch_seq_len
-
     off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+    q = tl.load(Q + off_q).to(reduce_dtype)
 
-    …
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, SPLIT_K)
+    split_k_start = kv_len_per_split * split_k_id
+    split_k_end = tl.minimum(split_k_start + kv_len_per_split, cur_batch_seq_len)
 
-    …
-    for start_mark in range(0, block_mask, 1):
-        q = tl.load(Q + off_q + start_mark).to(reduce_dtype)
-        offs_n_new = cur_batch_start_index + offs_n
+    for start_n in range(split_k_start, split_k_end, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
         k_loc = tl.load(
-            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + …,
-            mask=…,
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n,
+            mask=offs_n < split_k_end,
             other=0,
         )
         offs_buf_k = (
@@ -92,7 +89,7 @@ def _fwd_kernel_stage1(
         )
         k = tl.load(
             K_Buffer + offs_buf_k,
-            mask=…,
+            mask=(offs_n[:, None] < split_k_end) & (offs_d[None, :] < Lk),
             other=0.0,
         ).to(reduce_dtype)
         att_value = tl.sum(q[None, :] * k, 1)
@@ -102,7 +99,7 @@ def _fwd_kernel_stage1(
             att_value = logit_cap * tanh(att_value / logit_cap)
 
         off_o = cur_head * att_stride_h + (cur_batch_in_all_start_index + offs_n)
-        tl.store(Att_Out + off_o, att_value, mask=…)
+        tl.store(Att_Out + off_o, att_value, mask=offs_n < split_k_end)
 
 
 @triton.jit
@@ -191,11 +188,12 @@ def _decode_att_m_fwd(
     logit_cap,
 ):
     BLOCK = 32
+    SPLIT_K = 8
    Lk = k_buffer.shape[-1]
 
     batch, head_num = B_req_idx.shape[0], q.shape[1]
 
-    grid = (batch, head_num, …
+    grid = (batch, head_num, SPLIT_K)
     kv_group_num = q.shape[1] // k_buffer.shape[1]
 
     if kv_group_num == 1:
@@ -223,6 +221,7 @@ def _decode_att_m_fwd(
         kv_group_num=kv_group_num,
         BLOCK_DMODEL=BLOCK_DMODEL,
         BLOCK_N=BLOCK,
+        SPLIT_K=SPLIT_K,
         logit_cap=logit_cap,
         num_warps=num_warps,
         num_stages=1,
@@ -294,13 +293,14 @@ def _fwd_grouped_kernel_stage1(
     BLOCK_DPE: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_H: tl.constexpr,
+    SPLIT_K: tl.constexpr,
     logit_cap: tl.constexpr,
     Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head_id = tl.program_id(1)
     cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
-    …
+    split_k_id = tl.program_id(2)
 
     reduce_dtype = Att_Out.dtype.element_ty
 
@@ -317,30 +317,27 @@ def _fwd_grouped_kernel_stage1(
     cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
     cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
 
-    cur_batch_start_index = 0
-    cur_batch_end_index = cur_batch_seq_len
-
     offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
+    q = tl.load(
+        Q + offs_q, mask=(mask_h[:, None]) & (offs_d[None, :] < Lk), other=0.0
+    ).to(reduce_dtype)
 
     if BLOCK_DPE > 0:
         offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
         off_qpe = (
             cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :]
         )
+        qpe = tl.load(Q + off_qpe, mask=mask_h[:, None], other=0.0).to(reduce_dtype)
 
-    …
-    block_mask = tl.where(block_stard_index < cur_batch_seq_len, 1, 0)
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, SPLIT_K)
+    split_k_start = kv_len_per_split * split_k_id
+    split_k_end = tl.minimum(split_k_start + kv_len_per_split, cur_batch_seq_len)
 
-    for …
-        …
-            Q + offs_q + start_mark, mask=(mask_h[:, None]) & (offs_d[None, :] < Lk)
-        ).to(reduce_dtype)
-        offs_n_new = cur_batch_start_index + offs_n
+    for start_n in range(split_k_start, split_k_end, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
         k_loc = tl.load(
-            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + …,
-            mask=…,
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n,
+            mask=offs_n < split_k_end,
             other=0,
         )
         offs_buf_k = (
@@ -350,14 +347,11 @@ def _fwd_grouped_kernel_stage1(
         )
         k = tl.load(
             K_Buffer + offs_buf_k,
-            mask=…,
+            mask=(offs_n[None, :] < split_k_end) & (offs_d[:, None] < Lk),
             other=0.0,
         ).to(reduce_dtype)
         qk = tl.dot(q, k)
         if BLOCK_DPE > 0:
-            qpe = tl.load(Q + off_qpe + start_mark, mask=mask_h[:, None]).to(
-                reduce_dtype
-            )
             offs_buf_kpe = (
                 k_loc[None, :] * stride_buf_kbs
                 + cur_kv_head * stride_buf_kh
@@ -365,7 +359,7 @@ def _fwd_grouped_kernel_stage1(
             )
             kpe = tl.load(
                 K_Buffer + offs_buf_kpe,
-                mask=…,
+                mask=offs_n[None, :] < split_k_end,
                 other=0.0,
             ).to(reduce_dtype)
             qk += tl.dot(qpe, kpe)
@@ -381,7 +375,7 @@ def _fwd_grouped_kernel_stage1(
         tl.store(
             Att_Out + offs_o,
             qk,
-            mask=mask_h[:, None] & (…),
+            mask=mask_h[:, None] & (offs_n[None, :] < split_k_end),
         )
 
 
@@ -499,16 +493,17 @@ def _decode_grouped_att_m_fwd(
     kv_group_num = q.shape[1] // k_buffer.shape[1]
 
     BLOCK_H = max(16, min(64, triton.next_power_of_2(kv_group_num)))
+    SPLIT_K = 8
     grid = (
         batch,
         triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
-        …
+        SPLIT_K,
     )
 
     num_warps = 4
 
     extra_kargs = {}
-    if …
+    if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
@@ -534,6 +529,7 @@ def _decode_grouped_att_m_fwd(
         BLOCK_DPE=BLOCK_DPE,
         BLOCK_N=BLOCK,
         BLOCK_H=BLOCK_H,
+        SPLIT_K=SPLIT_K,
         logit_cap=logit_cap,
         num_warps=num_warps,
         num_stages=1,
@@ -563,7 +559,7 @@ def _decode_grouped_softmax_reducev_fwd(
     BLOCK_DMODEL = triton.next_power_of_2(Lv)
 
     extra_kargs = {}
-    if …
+    if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
```
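The recurring change in the kernels above: the launch grid gains a third axis (`SPLIT_K = 8`), and each program now handles one contiguous chunk of the cached KV sequence instead of all of it. A plain-Python sketch (illustrative only) of the chunking arithmetic:

```python
import math

def split_k_ranges(seq_len: int, split_k: int = 8):
    """Yield the [start, end) KV range each split-K program covers."""
    kv_len_per_split = math.ceil(seq_len / split_k)  # mirrors tl.cdiv
    for split_k_id in range(split_k):
        start = kv_len_per_split * split_k_id
        end = min(start + kv_len_per_split, seq_len)
        if start < end:  # programs past the sequence end do no work
            yield start, end

# For a 100-token KV cache and SPLIT_K=8: chunks of 13 tokens, last chunk 9.
print(list(split_k_ranges(100)))
```

The per-chunk attention logits are masked against `split_k_end` on store, and the softmax/reduce-V stage (`_decode_grouped_softmax_reducev_fwd` above) combines them.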
sglang/srt/layers/attention/triton_ops/extend_attention.py
CHANGED
```diff
@@ -1,18 +1,16 @@
-…
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Memory-efficient attention for prefill.
 It supports page size = 1 and prefill with KV cache (i.e. extend).
@@ -31,6 +29,8 @@ is_cuda_available = torch.cuda.is_available()
 if is_cuda_available:
     CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
+is_hip_ = is_hip()
+
 
 @triton.jit
 def tanh(x):
@@ -313,7 +313,7 @@ def extend_attention_fwd(
     num_stages = 1
 
     extra_kargs = {}
-    if …
+    if is_hip_:
         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
 
     _fwd_kernel[grid](
```
sglang/srt/layers/attention/triton_ops/prefill_attention.py
CHANGED
```diff
@@ -1,18 +1,16 @@
-…
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Memory-efficient attention for prefill.
 It supporst page size = 1.
```
sglang/srt/layers/custom_op_util.py
CHANGED
```diff
@@ -1,17 +1,16 @@
-…
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 from vllm.model_executor.custom_op import CustomOp
 
```
sglang/srt/layers/fused_moe_grok/__init__.py
ADDED
```diff
@@ -0,0 +1 @@
+from sglang.srt.layers.fused_moe_grok.layer import FusedMoE, FusedMoEMethodBase
```
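For downstream imports, the practical effect of the `fused_moe` → `fused_moe_grok` / `fused_moe_triton` split is the module path. A sketch of before/after; the old path is an assumption inferred from the removed `fused_moe/__init__.py`, and the `fused_moe_triton` exports appear later in this diff:

```python
# Before (0.3.6) -- assumed old path, removed in this release:
# from sglang.srt.layers.fused_moe import FusedMoE, FusedMoEMethodBase

# After (0.3.6.post1) -- each backend gets its own namespace:
from sglang.srt.layers.fused_moe_grok import FusedMoE, FusedMoEMethodBase  # Grok path
from sglang.srt.layers.fused_moe_triton import fused_experts, fused_topk   # new Triton path
```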
sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py
CHANGED
```diff
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 from vllm.model_executor.utils import set_weight_attrs
 
-from sglang.srt.layers.…
+from sglang.srt.layers.fused_moe_grok.fused_moe import padding_size
 from sglang.srt.utils import is_hip
 
 logger = init_logger(__name__)
@@ -123,7 +123,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         num_expert_group: Optional[int],
         topk_group: Optional[int],
     ) -> torch.Tensor:
-        from sglang.srt.layers.…
+        from sglang.srt.layers.fused_moe_grok.fused_moe import fused_moe
 
         return fused_moe(
             x,
@@ -153,12 +153,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         num_expert_group: Optional[int],
         topk_group: Optional[int],
     ) -> torch.Tensor:
-        …
-        assert not use_grouped_topk
-        assert num_expert_group is None
-        assert topk_group is None
-        return fused_moe(x, w1, w2, router_logits, top_k, renormalize)
+        raise NotImplementedError("The TPU backend currently does not support MoE.")
 
 
 class FusedMoE(torch.nn.Module):
@@ -614,7 +609,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         topk_group: Optional[int] = None,
     ) -> torch.Tensor:
 
-        from sglang.srt.layers.…
+        from sglang.srt.layers.fused_moe_grok.fused_moe import fused_moe
 
         return fused_moe(
             x,
```
sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py}
CHANGED
```diff
@@ -1,3 +1,8 @@
+"""
+Torch-native implementation for FusedMoE. This is used for torch.compile.
+It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
+"""
+
 from typing import Callable, Optional
 
 import torch
```
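The cited gpt-fast code implements MoE with plain tensor indexing and einsums so `torch.compile` can trace it without custom kernels. A condensed, illustrative sketch of that pattern; the names and shapes below are ours for illustration, not sglang's API:

```python
import torch
import torch.nn.functional as F

def torch_native_moe(
    x: torch.Tensor,              # [T, D] tokens
    w1: torch.Tensor,             # [E, I, D] gate projection per expert
    w3: torch.Tensor,             # [E, I, D] up projection per expert
    w2: torch.Tensor,             # [E, D, I] down projection per expert
    router_logits: torch.Tensor,  # [T, E]
    top_k: int,
    renormalize: bool = True,
) -> torch.Tensor:
    weights = F.softmax(router_logits, dim=-1)
    weights, expert_indices = torch.topk(weights, top_k)  # both [T, top_k]
    if renormalize:
        weights = weights / weights.sum(dim=-1, keepdim=True)
    # Gather each token's top-k expert weight matrices, then apply SwiGLU.
    x1 = F.silu(torch.einsum("ti,taoi->tao", x, w1[expert_indices]))
    x3 = torch.einsum("ti,taoi->tao", x, w3[expert_indices])
    out = torch.einsum("tao,taio->tai", x1 * x3, w2[expert_indices])
    # Mix the top-k expert outputs with the routing weights.
    return torch.einsum("tai,ta->ti", out, weights)
```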
sglang/srt/layers/fused_moe_triton/__init__.py
ADDED
```diff
@@ -0,0 +1,44 @@
+from contextlib import contextmanager
+from typing import Any, Dict, Optional
+
+import sglang.srt.layers.fused_moe_triton.fused_moe  # noqa
+from sglang.srt.layers.fused_moe_triton.fused_moe import (
+    fused_experts,
+    fused_topk,
+    get_config_file_name,
+    grouped_topk,
+)
+from sglang.srt.layers.fused_moe_triton.layer import (
+    FusedMoE,
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+)
+
+_config: Optional[Dict[str, Any]] = None
+
+
+@contextmanager
+def override_config(config):
+    global _config
+    old_config = _config
+    _config = config
+    yield
+    _config = old_config
+
+
+def get_config() -> Optional[Dict[str, Any]]:
+    return _config
+
+
+__all__ = [
+    "FusedMoE",
+    "FusedMoEMethodBase",
+    "FusedMoeWeightScaleSupported",
+    "override_config",
+    "get_config",
+    "fused_moe",
+    "fused_topk",
+    "fused_experts",
+    "get_config_file_name",
+    "grouped_topk",
+]
```
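A hypothetical use of the new `override_config`/`get_config` pair: temporarily pin a fused-MoE kernel config for a region of code instead of the autotuned default. The `BLOCK_SIZE_*` keys below are illustrative config entries, not a schema this module checks:

```python
from sglang.srt.layers.fused_moe_triton import get_config, override_config

custom = {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}

assert get_config() is None           # no override installed by default
with override_config(custom):
    assert get_config() == custom     # fused-MoE kernels can read the pinned config
assert get_config() is None           # previous value restored on exit
```

The context manager simply swaps a module-level global in and out, so overrides nest naturally and always restore the prior value.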