sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +2 -2
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +9 -7
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +48 -43
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +7 -2
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +227 -120
- sglang/srt/disaggregation/nixl/conn.py +1 -0
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +7 -1
- sglang/srt/entrypoints/engine.py +17 -2
- sglang/srt/entrypoints/http_server.py +17 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +1 -1
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +72 -71
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +3 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +76 -24
- sglang/srt/managers/schedule_policy.py +0 -3
- sglang/srt/managers/scheduler.py +113 -88
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +133 -34
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/memory_pool.py +2 -0
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +19 -14
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +23 -20
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +5 -6
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +30 -4
- sglang/srt/openai_api/protocol.py +0 -8
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +34 -4
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +6 -5
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +89 -14
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -49,6 +49,7 @@ from sglang.srt.disaggregation.utils import (
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
+    AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
@@ -221,7 +222,7 @@ async def get_server_info():
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
-
+        "internal_states": internal_states,
         "version": __version__,
     }

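The `/get_server_info` payload now nests the scheduler's internal state under its own key. A minimal client sketch, assuming the route is still exposed as `GET /get_server_info` on a locally running server (address and port are illustrative):

```python
# Hedged sketch: read the new "internal_states" entry from the server info payload.
# The server address is an assumption for illustration.
import requests

info = requests.get("http://localhost:30000/get_server_info", timeout=10).json()
print(info["version"])                 # package version, as before
print(info.get("internal_states"))     # now returned under its own key
```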
@@ -337,7 +338,11 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
         obj = ProfileReqInput()

     await _global_state.tokenizer_manager.start_profile(
-        obj.output_dir,
+        output_dir=obj.output_dir,
+        num_steps=obj.num_steps,
+        activities=obj.activities,
+        with_stack=obj.with_stack,
+        record_shapes=obj.record_shapes,
     )
     return Response(
         content="Start profiling.\n",
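The profiling endpoint now forwards the full set of `ProfileReqInput` fields instead of only `output_dir`. A hedged client sketch; the server address and every field value below are assumptions for illustration:

```python
# Hedged sketch: exercise the expanded /start_profile parameters shown above.
# Server address and all values are illustrative assumptions.
import requests

requests.post(
    "http://localhost:30000/start_profile",
    json={
        "output_dir": "/tmp/sglang_trace",  # where trace files are written
        "num_steps": 5,                     # stop automatically after a few steps
        "activities": ["CPU", "GPU"],       # assumed activity names
        "with_stack": True,                 # record stack traces
        "record_shapes": True,              # record operator input shapes
    },
    timeout=30,
)
```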
@@ -539,6 +544,16 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
     return Response(status_code=200)


+@app.post("/abort_request")
+async def abort_request(obj: AbortReq, request: Request):
+    """Abort a request."""
+    try:
+        _global_state.tokenizer_manager.abort_request(rid=obj.rid)
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.post("/parse_function_call")
 async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
     """
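A hedged sketch of calling the new `/abort_request` route; the server address and the `rid` value are illustrative assumptions:

```python
# Hedged sketch: abort an in-flight request via the new /abort_request route.
# The server URL and the rid value are assumptions for illustration.
import requests

resp = requests.post(
    "http://localhost:30000/abort_request",
    json={"rid": "example-request-id"},  # rid of the request to abort
    timeout=10,
)
print(resp.status_code)  # 200 when the abort was accepted
```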
sglang/srt/function_call_parser.py
CHANGED
@@ -86,8 +86,8 @@ class StructureInfo:

 _GetInfoFunc = Callable[[str], StructureInfo]
 """
-
-
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
 which can be used to construct a structural_tag object
 """

sglang/srt/layers/attention/flashattention_backend.py
CHANGED
@@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend):
         ), "Sliding window and cross attention are not supported together"

         self.forward_metadata: FlashAttentionMetadata = None
-        # extra
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
         self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
         self.max_context_len = model_runner.model_config.context_len
         self.device = model_runner.device
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
CHANGED
@@ -919,7 +919,7 @@ def _fwd_kernel(

         e_max = n_e_max

-    # stage 2: compute the
+    # stage 2: compute the triangle part

     cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
     for start_n in range(0, cur_block_m_end, BLOCK_N):
sglang/srt/layers/attention/utils.py
CHANGED
@@ -28,7 +28,8 @@ def create_flashinfer_kv_indices_triton(

     num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
     for i in range(num_loop):
-
+        # index into req_to_token_ptr needs to be int64
+        offset = tl.arange(0, BLOCK_SIZE).to(tl.int64) + i * BLOCK_SIZE
         mask = offset < kv_end - kv_start
         data = tl.load(
             req_to_token_ptr
@@ -70,8 +71,9 @@ def create_flashmla_kv_indices_triton(
     num_pages_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)

     for i in range(num_pages_loop):
+        # index into req_to_token_ptr needs to be int64
         paged_offset = (
-            tl.arange(0, NUM_PAGE_PER_BLOCK) + i * NUM_PAGE_PER_BLOCK
+            tl.arange(0, NUM_PAGE_PER_BLOCK).to(tl.int64) + i * NUM_PAGE_PER_BLOCK
         ) * PAGED_SIZE
         paged_offset_out = tl.arange(0, NUM_PAGE_PER_BLOCK) + i * NUM_PAGE_PER_BLOCK

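Both kernels now cast the block offsets to `int64` before indexing `req_to_token_ptr`. A back-of-the-envelope check of why 32-bit offsets can overflow; the table dimensions below are assumptions chosen for illustration:

```python
# Hedged arithmetic note on the int64 cast above: with a large request-to-token
# table the flat element offset can exceed the int32 range. Sizes are assumptions.
max_running_requests, max_context_len = 4096, 1_048_576
flat_offset = (max_running_requests - 1) * max_context_len + (max_context_len - 1)
print(flat_offset)               # 4294967295
print(flat_offset > 2**31 - 1)   # True -> an int32 offset would wrap around
```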
sglang/srt/layers/dp_attention.py
CHANGED
@@ -24,8 +24,10 @@ if TYPE_CHECKING:
 _ATTN_TP_GROUP = None
 _ATTN_TP_RANK = None
 _ATTN_TP_SIZE = None
-
-
+_ATTN_DP_RANK = None
+_ATTN_DP_SIZE = None
+_LOCAL_ATTN_DP_SIZE = None
+_LOCAL_ATTN_DP_RANK = None


 def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
@@ -33,9 +35,27 @@ def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
         return tp_rank, tp_size, 0

     attn_tp_size = tp_size // dp_size
-
+    attn_dp_rank = tp_rank // attn_tp_size
     attn_tp_rank = tp_rank % attn_tp_size
-
+
+    return attn_tp_rank, attn_tp_size, attn_dp_rank
+
+
+def compute_dp_attention_local_info(
+    enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+):
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+
+    local_tp_size = moe_dense_tp_size if moe_dense_tp_size else tp_size
+    local_tp_rank = tp_rank % local_tp_size
+    local_dp_size = max(1, dp_size // (tp_size // local_tp_size))
+
+    local_attn_tp_size = local_tp_size // local_dp_size
+    local_attn_dp_rank = local_tp_rank // local_attn_tp_size
+    local_attn_tp_rank = local_tp_rank % local_attn_tp_size
+
+    return local_attn_tp_rank, local_attn_tp_size, local_attn_dp_rank


 def initialize_dp_attention(
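To make the new local DP/TP rank arithmetic concrete, here is a standalone driver around the `compute_dp_attention_local_info` logic added above; the sizes passed in are assumptions chosen for illustration:

```python
# Hedged worked example for the local DP/TP rank math introduced above.
# The sizes below are illustrative assumptions, not values from the diff.
def compute_dp_attention_local_info(
    enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
):
    if not enable_dp_attention:
        return tp_rank, tp_size, 0

    local_tp_size = moe_dense_tp_size if moe_dense_tp_size else tp_size
    local_tp_rank = tp_rank % local_tp_size
    local_dp_size = max(1, dp_size // (tp_size // local_tp_size))

    local_attn_tp_size = local_tp_size // local_dp_size
    local_attn_dp_rank = local_tp_rank // local_attn_tp_size
    local_attn_tp_rank = local_tp_rank % local_attn_tp_size

    return local_attn_tp_rank, local_attn_tp_size, local_attn_dp_rank


for tp_rank in range(8):
    # tp_size=8, dp_size=4, moe_dense_tp_size=2 -> local_tp_size=2, local_dp_size=1
    print(tp_rank, compute_dp_attention_local_info(True, tp_rank, 8, 4, 2))
# Every rank ends up with local_attn_tp_size=2 and local_attn_dp_rank=0, while
# local_attn_tp_rank alternates 0/1 with the rank parity.
```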
@@ -43,22 +63,32 @@ def initialize_dp_attention(
     tp_rank: int,
     tp_size: int,
     dp_size: int,
+    moe_dense_tp_size: int,
     pp_size: int,
 ):
-    global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE,
+    global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK, _ATTN_DP_SIZE
+    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK

     from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP

-    _ATTN_TP_RANK, _ATTN_TP_SIZE,
+    _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK = compute_dp_attention_world_info(
         enable_dp_attention, tp_rank, tp_size, dp_size
     )
+    _, _, _LOCAL_ATTN_DP_RANK = compute_dp_attention_local_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+    )

     if enable_dp_attention:
         local_rank = tp_rank % (tp_size // dp_size)
-
+        _ATTN_DP_SIZE = dp_size
+        if moe_dense_tp_size is None:
+            _LOCAL_ATTN_DP_SIZE = _ATTN_DP_SIZE
+        else:
+            _LOCAL_ATTN_DP_SIZE = max(1, dp_size // (tp_size // moe_dense_tp_size))
     else:
         local_rank = tp_rank
-
+        _ATTN_DP_SIZE = 1
+        _LOCAL_ATTN_DP_SIZE = 1

     tp_group = get_tp_group()
     _ATTN_TP_GROUP = GroupCoordinator(
@@ -93,13 +123,33 @@ def get_attention_tp_size():


 def get_attention_dp_rank():
-    assert
-    return
+    assert _ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _ATTN_DP_RANK


 def get_attention_dp_size():
-    assert
-    return
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _ATTN_DP_SIZE
+
+
+def get_local_attention_dp_rank():
+    assert _LOCAL_ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_RANK
+
+
+def get_local_attention_dp_size():
+    assert _LOCAL_ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_SIZE
+
+
+def get_local_attention_dp_rank():
+    assert _LOCAL_ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_RANK
+
+
+def get_local_attention_dp_size():
+    assert _LOCAL_ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_SIZE


 @contextmanager
@@ -112,19 +162,19 @@ def disable_dp_size():
     Args:
         tp_group (GroupCoordinator): the tp group coordinator
     """
-    global
-    assert
+    global _ATTN_DP_SIZE
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"

-    old_dp_size =
-
+    old_dp_size = _ATTN_DP_SIZE
+    _ATTN_DP_SIZE = 1
     try:
         yield
     finally:
-
+        _ATTN_DP_SIZE = old_dp_size


 def get_dp_local_info(forward_batch: ForwardBatch):
-    dp_rank =
+    dp_rank = get_local_attention_dp_rank()

     if forward_batch.dp_local_start_pos is None:
         cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
@@ -201,7 +251,7 @@ def _dp_gather(
         global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
     )

-    # Input IDs are in int 32. We should use inplace_all_reduce for local case
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
     NUM_GPUS_PER_NODE = 8
     if (
         not local_tokens.dtype.is_floating_point
@@ -252,12 +302,12 @@ def dp_scatter(
     )


-def
+def attn_tp_reduce_scatter(
     output: torch.Tensor,
     input_list: List[torch.Tensor],
 ):
     return get_attention_tp_group().reduce_scatter(output, input_list)


-def
+def attn_tp_all_gather(output_list: List[torch.Tensor], input_: torch.Tensor):
     return get_attention_tp_group().all_gather(input_, tensor_list=output_list)
sglang/srt/layers/layernorm.py
CHANGED
@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         if not x.is_contiguous():
-            # NOTE:
+            # NOTE: Remove this if aiter kernel supports discontinuous input
             x = x.contiguous()
         if residual is not None:
             fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)
sglang/srt/layers/logits_processor.py
CHANGED
@@ -23,15 +23,17 @@ import triton.language as tl
 from torch import nn

 from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
 from sglang.srt.layers.dp_attention import (
+    attn_tp_all_gather,
     dp_gather_replicate,
     dp_scatter,
-    get_attention_dp_rank,
     get_attention_dp_size,
+    get_attention_tp_size,
+    get_local_attention_dp_rank,
+    get_local_attention_dp_size,
 )
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -45,6 +47,18 @@ from sglang.srt.utils import dump_to_file
 logger = logging.getLogger(__name__)


+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.utils import dump_to_file
+
+logger = logging.getLogger(__name__)
+
+
 @dataclasses.dataclass
 class LogitsProcessorOutput:
     ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
@@ -169,7 +183,7 @@ class LogitsMetadata:
             return

         cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
-        dp_rank =
+        dp_rank = get_local_attention_dp_rank()
         if dp_rank == 0:
             dp_local_start_pos = torch.zeros_like(
                 self.global_num_tokens_for_logprob_gpu[0]
@@ -198,12 +212,20 @@ class LogitsProcessor(nn.Module):
         super().__init__()
         self.config = config
         self.logit_scale = logit_scale
-        self.
-
-
-
-
-
+        self.use_attn_tp_group = global_server_args_dict["enable_dp_lm_head"]
+        if self.use_attn_tp_group:
+            self.attn_tp_size = get_attention_tp_size()
+            self.do_tensor_parallel_all_gather = (
+                not skip_all_gather and self.attn_tp_size > 1
+            )
+            self.do_tensor_parallel_all_gather_dp_attn = False
+        else:
+            self.do_tensor_parallel_all_gather = (
+                not skip_all_gather and get_tensor_model_parallel_world_size() > 1
+            )
+            self.do_tensor_parallel_all_gather_dp_attn = (
+                self.do_tensor_parallel_all_gather and get_attention_dp_size() != 1
+            )
         self.final_logit_softcapping = getattr(
             self.config, "final_logit_softcapping", None
         )
@@ -315,7 +337,8 @@ class LogitsProcessor(nn.Module):

         if self.debug_tensor_dump_output_folder:
             assert (
-                not self.do_tensor_parallel_all_gather
+                not self.do_tensor_parallel_all_gather
+                or get_local_attention_dp_size() == 1
             ), "dp attention + sharded lm_head doesn't support full logits"
             full_logits = self._get_logits(hidden_states, lm_head, logits_metadata)
             dump_to_file(self.debug_tensor_dump_output_folder, "logits", full_logits)
@@ -442,7 +465,19 @@ class LogitsProcessor(nn.Module):
             logits.mul_(self.logit_scale)

         if self.do_tensor_parallel_all_gather:
-
+            if self.use_attn_tp_group:
+                global_logits = torch.empty(
+                    (self.config.vocab_size, logits.shape[0]),
+                    device=logits.device,
+                    dtype=logits.dtype,
+                )
+                global_logits = global_logits.T
+                attn_tp_all_gather(
+                    list(global_logits.tensor_split(self.attn_tp_size, dim=-1)), logits
+                )
+                logits = global_logits
+            else:
+                logits = tensor_model_parallel_all_gather(logits)

         if self.do_tensor_parallel_all_gather_dp_attn:
             logits, global_logits = (
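The new `use_attn_tp_group` branch gathers vocab-sharded logits from the attention TP group into a transposed buffer so that each rank's shard lands in a contiguous slice. A single-process sketch of the same shape bookkeeping, with toy sizes chosen as assumptions and the collective replaced by plain copies:

```python
# Hedged single-process sketch of the gather layout used above when the lm_head
# is sharded across the attention TP group. Toy sizes are illustrative assumptions;
# the real code calls attn_tp_all_gather on a process group instead of copy_.
import torch

attn_tp_size, num_tokens, vocab_size = 2, 3, 8
shard = vocab_size // attn_tp_size

# Each rank produces logits only for its vocab shard: (num_tokens, shard).
per_rank_logits = [
    torch.full((num_tokens, shard), float(rank)) for rank in range(attn_tp_size)
]

# Allocate (vocab_size, num_tokens) and view it transposed, so splitting the last
# dim hands out contiguous per-rank chunks of the underlying buffer.
global_logits = torch.empty((vocab_size, num_tokens)).T

# Emulate the all-gather: rank r's shard lands in the r-th split along the vocab dim.
for chunk, local in zip(global_logits.tensor_split(attn_tp_size, dim=-1), per_rank_logits):
    chunk.copy_(local)

print(global_logits.shape)  # torch.Size([3, 8]) -> full-vocab logits per token
```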
sglang/srt/layers/moe/ep_moe/kernels.py
CHANGED
@@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
     seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
     src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)

-    # Find
+    # Find offset
     expert_ids = torch.arange(
         num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
     )
sglang/srt/layers/moe/ep_moe/layer.py
CHANGED
@@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
                 self.quant_config.weight_block_size[1],
             )
             # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by
+            # Required by column parallel or enabling merged weights
             if intermediate_size % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
CHANGED
@@ -994,7 +994,7 @@ def get_default_config(
             "num_stages": 2 if _is_hip else 4,
         }
     else:
-        # Block-wise quant: BLOCK_SIZE_K must be
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
        config = {
             "BLOCK_SIZE_M": 64,
             "BLOCK_SIZE_N": block_shape[0],
sglang/srt/layers/moe/topk.py
CHANGED
@@ -270,7 +270,7 @@ def select_experts(
     routed_scaling_factor: Optional[float] = None,
 ):
     n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
-    #
+    # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "
+            "Please install vllm by `pip install vllm==0.8.4`"
         )

     return QUANTIZATION_METHODS[quantization]
sglang/srt/layers/quantization/blockwise_int8.py
CHANGED
@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
                 f"{input_size_per_partition} is not divisible by "
                 f"weight quantization block_k = {block_k}."
             )
-        # Required by
+        # Required by column parallel or enabling merged weights
         if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
             output_partition_sizes
         ) > 1:
@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
                 self.quant_config.weight_block_size[1],
             )
             # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by
+            # Required by column parallel or enabling merged weights
             if intermediate_size % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "