sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_utils.py
CHANGED
@@ -14,6 +14,9 @@ except ImportError:
 
 from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    fp8_max,
+    is_fp8_fnuz,
     per_token_group_quant_fp8,
     scaled_fp8_quant,
     sglang_per_token_quant_fp8,
@@ -30,8 +33,11 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
 
-
+use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+
+if _is_hip and use_aiter_moe:
     from aiter import gemm_a8w8_blockscale
 
 if _is_cuda:
@@ -43,19 +49,23 @@ use_vllm_cutlass_w8a8_fp8_kernel = get_bool_env_var("USE_VLLM_CUTLASS_W8A8_FP8_K
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
 TORCH_DEVICE_IDENTITY = None
 
-
-
-
-
-
-
-
-
-# The condition is
-#
-
-
-)
+
+def use_rowwise_torch_scaled_mm():
+    _TORCH_VERSION = torch.__version__.split("+")[0]
+    try:
+        _TORCH_VERSION_TUPLE = tuple(map(int, _TORCH_VERSION.split(".")[:3]))
+    except ValueError:
+        _TORCH_VERSION_TUPLE = (0, 0, 0)
+    if _is_hip:
+        # The condition to determine if it is on a platform that supports
+        # torch._scaled_mm rowwise feature.
+        # The condition is determined once as the operations
+        # are time consuming.
+        return get_device_capability() >= (9, 4) and _TORCH_VERSION_TUPLE >= (2, 7, 0)
+    return False
+
+
+USE_ROWWISE_TORCH_SCALED_MM = use_rowwise_torch_scaled_mm()
 
 
 def cutlass_fp8_supported():
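The new `USE_ROWWISE_TORCH_SCALED_MM` flag is computed once at import time and only turns on for ROCm builds running on gfx94x-class (MI300) devices with PyTorch 2.7 or newer. A rough standalone equivalent of that check, using `torch.cuda.get_device_capability()` as a stand-in for sglang's `get_device_capability()` helper (an assumption, not the library code):

```python
import torch

def rowwise_scaled_mm_supported() -> bool:
    """Sketch of the gate above: ROCm gfx94x-class device plus torch >= 2.7."""
    try:
        version = tuple(map(int, torch.__version__.split("+")[0].split(".")[:3]))
    except ValueError:
        version = (0, 0, 0)
    if torch.version.hip is not None and torch.cuda.is_available():
        return torch.cuda.get_device_capability() >= (9, 4) and version >= (2, 7, 0)
    return False

print(rowwise_scaled_mm_supported())
```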
@@ -132,7 +142,7 @@ def apply_w8a8_block_fp8_linear(
         output = fp8_blockwise_scaled_mm(
             q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
         )
-    elif _is_hip and
+    elif _is_hip and use_aiter_moe:
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=False
         )
@@ -164,18 +174,21 @@ def apply_w8a8_block_fp8_linear(
 
 
 def input_to_float8(
-    x: torch.Tensor, dtype: torch.dtype =
+    x: torch.Tensor, dtype: torch.dtype = fp8_dtype
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values with tensor-wise quantization."""
-    finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs()).float().clamp(min=1e-12)
-
-    if
-    dtype =
-
-
-
+
+    if _is_fp8_fnuz:
+        dtype = fp8_dtype
+        fp_max = fp8_max
+    else:
+        finfo = torch.finfo(dtype)
+        fp_max = finfo.max
+
+    scale = fp_max / amax
+    x_scl_sat = (x.float() * scale).clamp(min=-fp_max, max=fp_max)
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
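`input_to_float8` now takes its saturation bound from `fp8_kernel`'s `fp8_dtype`/`fp8_max` on fnuz platforms rather than always asking `torch.finfo`. The underlying recipe is unchanged: scale by `fp_max / amax`, clamp, cast to fp8, and keep the reciprocal scale for dequantization. A minimal self-contained illustration of that recipe in plain PyTorch (not the sglang helper itself; needs a build that ships `float8_e4m3fn`, i.e. 2.1+):

```python
import torch

def quantize_fp8_tensorwise(x: torch.Tensor, dtype=torch.float8_e4m3fn):
    """Illustrative tensor-wise fp8 quantization: returns (x_fp8, reciprocal scale)."""
    fp_max = torch.finfo(dtype).max
    amax = x.abs().amax().float().clamp(min=1e-12)
    scale = fp_max / amax
    x_q = (x.float() * scale).clamp(min=-fp_max, max=fp_max).to(dtype)
    return x_q, scale.reciprocal()

x = torch.randn(4, 8)
x_q, inv_scale = quantize_fp8_tensorwise(x)
x_dq = x_q.to(torch.float32) * inv_scale
print((x - x_dq).abs().max())  # small round-trip error
```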
@@ -222,6 +235,41 @@ def block_quant_to_tensor_quant(
     return x_q_tensor, scale
 
 
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+    for j in range(n_tiles):
+        for i in range(k_tiles):
+            x_q_block_tile = x_q_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile = x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+    return x_dq_block
+
+
 def channel_quant_to_tensor_quant(
     x_q_channel: torch.Tensor,
     x_s: torch.Tensor,
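A quick shape check of the new `block_quant_dequant` helper, assuming it is exposed from `sglang.srt.layers.quantization.fp8_utils` (the file this hunk appears to modify): feed it an fp8 tensor plus one scale per (128, 128) block and get back a dequantized tensor of the requested dtype.

```python
import torch
from sglang.srt.layers.quantization.fp8_utils import block_quant_dequant  # assumed location

n, k, block = 256, 512, 128
x_q = torch.randn(n, k).to(torch.float8_e4m3fn)    # synthetic block-quantized weight
scales = torch.rand(n // block, k // block) + 0.5  # one scale per (128, 128) tile

w = block_quant_dequant(x_q, scales, [block, block], dtype=torch.bfloat16)
print(w.shape, w.dtype)  # torch.Size([256, 512]) torch.bfloat16
```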
sglang/srt/layers/quantization/int8_kernel.py
CHANGED
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
     y_s_ptr,
     # Stride of input
     y_stride,
-    #
+    # Columns of input
     N,
     # Avoid to divide zero
     eps,
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
         config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
     else:
         # Default config
-        # Block-wise quant: BLOCK_SIZE_K must be
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
         config = {
             "BLOCK_SIZE_M": 64,
             "BLOCK_SIZE_N": block_size[0],
sglang/srt/layers/quantization/kv_cache.py
CHANGED
@@ -8,10 +8,8 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.utils import is_hip
-
-_is_hip = is_hip()
 
 logger = logging.getLogger(__name__)
 
@@ -44,11 +42,6 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
         )
 
-    @classmethod
-    def is_fp8_fnuz(cls) -> bool:
-        # only device 0 is checked, this assumes MI300 platforms are homogeneous
-        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
-
     def apply(self, layer: torch.nn.Module) -> torch.Tensor:
         raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
 
@@ -57,7 +50,7 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             # We prefer to use separate k_scale and v_scale if present
             k_scale = layer.k_scale.to("cpu").tolist()
             v_scale = layer.v_scale.to("cpu").tolist()
-            if
+            if is_fp8_fnuz():
                 k_scale *= 2
                 v_scale *= 2
         elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
@@ -73,7 +66,7 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             scale_to_duplicate = max(layer.k_scale, layer.v_scale)
             k_scale = scale_to_duplicate.to("cpu").tolist()
             v_scale = scale_to_duplicate.to("cpu").tolist()
-            if
+            if is_fp8_fnuz():
                 k_scale *= 2
                 v_scale *= 2
 
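`is_fp8_fnuz()` is now imported from `fp8_kernel` instead of being duplicated as a classmethod here, and it still gates the doubling of the loaded k/v scales. The doubling exists because `float8_e4m3fnuz` (the fp8 flavor on gfx94x / MI300 parts) covers roughly half the numeric range of the OCP `float8_e4m3fn` format that checkpoints are typically calibrated for, which the dtype limits show directly (recent PyTorch ships both dtypes):

```python
import torch

# e4m3fn saturates at 448, e4m3fnuz at 240, roughly half the range,
# hence k/v scales loaded from an e4m3fn checkpoint are multiplied by 2 on fnuz devices.
print(torch.finfo(torch.float8_e4m3fn).max)    # 448.0
print(torch.finfo(torch.float8_e4m3fnuz).max)  # 240.0
```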
sglang/srt/layers/quantization/utils.py
CHANGED
@@ -14,11 +14,6 @@ if not _is_cuda:
     from vllm._custom_ops import scaled_fp8_quant
 
 
-def is_fp8_fnuz() -> bool:
-    # only device 0 is checked, this assumes MI300 platforms are homogeneous
-    return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
-
-
 def is_layer_skipped(
     prefix: str,
     ignored_layers: List[str],
sglang/srt/layers/quantization/w8a8_fp8.py
CHANGED
@@ -9,16 +9,20 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_kernel import
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+)
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
     input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
 )
-from sglang.srt.utils import
+from sglang.srt.utils import set_weight_attrs
 
-
+_is_fp8_fnuz = is_fp8_fnuz()
 
 
 class W8A8Fp8Config(QuantizationConfig):
@@ -97,7 +101,7 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
         if self.quantization_config.is_checkpoint_fp8_serialized:
             weight_scale = layer.weight_scale.detach()
             # If checkpoint offline quantized with w8a8_fp8, load the weight and weight_scale directly.
-            if
+            if _is_fp8_fnuz:
                 weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=weight, weight_scale=weight_scale
                 )
@@ -113,14 +117,9 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
                     layer.weight, layer.weight.shape[-1]
                 )
                 weight_scale = weight_scale.t().contiguous()
-                if _is_hip:
-                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
-                        weight=weight, weight_scale=weight_scale
-                    )
             else:
                 # if cutlass not supported, we fall back to use torch._scaled_mm
                 # which requires per tensor quantization on weight
-                fp8_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
                 qweight, weight_scale = input_to_float8(layer.weight, dtype=fp8_dtype)
 
             # Update the layer with the new values.
@@ -227,7 +226,6 @@ class W8A8FP8MoEMethod:
     ):
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
-        fp8_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
         # WEIGHTS
         w13_weight = torch.nn.Parameter(
             torch.empty(
sglang/srt/layers/sampler.py
CHANGED
@@ -239,10 +239,6 @@ def top_p_normalize_probs_torch(
 
 
 def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
-    assert len(top_logprobs_nums) == logprobs.shape[0], (
-        len(top_logprobs_nums),
-        logprobs.shape[0],
-    )
     max_k = max(top_logprobs_nums)
     ret = logprobs.topk(max_k, dim=1)
     values = ret.values.tolist()
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -13,6 +13,7 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.layers.parameter import BasevLLMParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -214,12 +215,14 @@ class VocabParallelEmbedding(torch.nn.Module):
         self,
         num_embeddings: int,
         embedding_dim: int,
+        *,
         params_dtype: Optional[torch.dtype] = None,
         org_num_embeddings: Optional[int] = None,
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         enable_tp: bool = True,
+        use_attn_tp_group: bool = False,
         use_presharded_weights: bool = False,
     ):
         super().__init__()
@@ -227,9 +230,14 @@ class VocabParallelEmbedding(torch.nn.Module):
 
         self.enable_tp = enable_tp
         if self.enable_tp:
-
-
+            if use_attn_tp_group:
+                tp_rank = get_attention_tp_rank()
+                self.tp_size = get_attention_tp_size()
+            else:
+                tp_rank = get_tensor_model_parallel_rank()
+                self.tp_size = get_tensor_model_parallel_world_size()
         else:
+            assert use_attn_tp_group is False
             tp_rank = 0
             self.tp_size = 1
 
@@ -519,22 +527,25 @@ class ParallelLMHead(VocabParallelEmbedding):
         self,
         num_embeddings: int,
         embedding_dim: int,
+        *,
         bias: bool = False,
         params_dtype: Optional[torch.dtype] = None,
         org_num_embeddings: Optional[int] = None,
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_attn_tp_group: bool = False,
         use_presharded_weights: bool = False,
     ):
         super().__init__(
             num_embeddings,
             embedding_dim,
-            params_dtype,
-            org_num_embeddings,
-            padding_size,
-            quant_config,
-            prefix,
+            params_dtype=params_dtype,
+            org_num_embeddings=org_num_embeddings,
+            padding_size=padding_size,
+            quant_config=quant_config,
+            prefix=prefix,
+            use_attn_tp_group=use_attn_tp_group,
             use_presharded_weights=use_presharded_weights,
         )
         self.quant_config = quant_config
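Both constructors now take everything after `embedding_dim` as keyword-only arguments (the bare `*`), so downstream code that passed `params_dtype` and friends positionally will raise a `TypeError` until it switches to keyword form, and the new `use_attn_tp_group` flag can only be set by name. A toy signature (not the sglang class) showing the breaking change:

```python
def new_style(num_embeddings, embedding_dim, *, params_dtype=None, prefix=""):
    return params_dtype, prefix

new_style(32000, 4096, params_dtype="bf16")   # ok: keyword form
try:
    new_style(32000, 4096, "bf16")            # positional params_dtype no longer allowed
except TypeError as err:
    print(err)  # new_style() takes 2 positional arguments but 3 were given
```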
sglang/srt/lora/lora_manager.py
CHANGED
@@ -100,7 +100,7 @@ class LoRAManager:
             self.configs[name] = LoRAConfig(path)
             self.hf_target_names.update(self.configs[name].target_modules)
 
-        # Target lora weight names for lora_a and lora_b modules
+        # Target lora weight names for lora_a and lora_b modules respectively.
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
         self.lora_weight_names: Set[Tuple[str]] = set(
             [get_stacked_name(module) for module in self.hf_target_names]
@@ -156,18 +156,15 @@ class LoRAManager:
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
 
-        if
-
-
-
-
+        if (
+            hasattr(self, "max_bs_in_cuda_graph")
+            and bs <= self.max_bs_in_cuda_graph
+            and forward_batch.forward_mode.is_cuda_graph()
+        ):
+            # Do in-place updates when CUDA graph is enabled and the batch forward mode
+            # could use CUDA graph.
             self.cuda_graph_batch_info.bs = bs
-
-            self.cuda_graph_batch_info.seg_lens[:bs].copy_(
-                forward_batch.extend_seq_lens
-            )
-            else:
-                self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
+            self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
             torch.cumsum(
                 self.cuda_graph_batch_info.seg_lens[:bs],
                 dim=0,
@@ -201,10 +198,10 @@ class LoRAManager:
         max_len = int(torch.max(seg_lens))
         weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
 
-        lora_ranks = torch.
+        lora_ranks = torch.zeros(
             (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
         )
-        scalings = torch.
+        scalings = torch.zeros(
             (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
         )
         for i, lora_path in enumerate(forward_batch.lora_paths):
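`lora_ranks` and `scalings` are now allocated with `torch.zeros`, so slots that the following loop over `forward_batch.lora_paths` never fills read as rank 0 and scaling 0.0 instead of whatever the previous allocation left behind, which is presumably the point of the change. The difference in miniature:

```python
import torch

ranks_uninit = torch.empty((4,), dtype=torch.int64)  # contents are arbitrary leftover memory
ranks_zeroed = torch.zeros((4,), dtype=torch.int64)  # unused slots deterministically read 0
print(ranks_uninit)
print(ranks_zeroed)  # tensor([0, 0, 0, 0])
```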
sglang/srt/lora/mem_pool.py
CHANGED
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
         self.uid_to_buffer_id: Dict[Optional[str], int] = {}
 
         # Buffer idx -> lora uid in memory pool
-        # All uids are
-        # Here we don't
+        # All uids are initialized as empty strings for empty buffer slots
+        # Here we don't initialize to None since None is a valid uid
         self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
 
     def get_lora_A_shape(
         self, module_name: str, base_model: torch.nn.Module
     ) -> Tuple[int]:
         """
-        Given a module_name (might be a stacked name), return the hidden dims of modules'
+        Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
         self, module_name: str, base_model: torch.nn.Module
     ) -> Tuple[int]:
         """
-        Given a module_name (might be a stacked name), return the hidden dims of modules'
+        Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
sglang/srt/lora/triton_ops/gate_up_lora_b.py
CHANGED
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )
 
-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/triton_ops/qkv_lora_b.py
CHANGED
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )
 
-    #
+    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/triton_ops/sgemm_lora_a.py
CHANGED
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )
 
-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/triton_ops/sgemm_lora_b.py
CHANGED
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )
 
-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/utils.py
CHANGED
@@ -79,7 +79,7 @@ def get_hidden_dim(
     module_name: str, config: AutoConfig, base_model: torch.nn.Module
 ) -> Tuple[int]:
     """
-    Given a module_name (might be a stacked name), return the hidden dims of modules'
+    Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
     """
 
     if hasattr(base_model, "get_hidden_dim"):