PyPI - sglang - Versions diffs - 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl - Mend

sglang 0.4.5.post3py3-none-any.whl → 0.4.6.post1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

sglang/bench_one_batch.py +19 -3
sglang/bench_serving.py +8 -9
sglang/compile_deep_gemm.py +45 -4
sglang/srt/code_completion_parser.py +1 -1
sglang/srt/configs/deepseekvl2.py +1 -1
sglang/srt/configs/model_config.py +9 -3
sglang/srt/constrained/llguidance_backend.py +78 -61
sglang/srt/conversation.py +34 -1
sglang/srt/disaggregation/decode.py +67 -13
sglang/srt/disaggregation/fake/__init__.py +1 -0
sglang/srt/disaggregation/fake/conn.py +88 -0
sglang/srt/disaggregation/mini_lb.py +45 -8
sglang/srt/disaggregation/mooncake/conn.py +198 -31
sglang/srt/disaggregation/prefill.py +36 -12
sglang/srt/disaggregation/utils.py +16 -2
sglang/srt/entrypoints/engine.py +9 -0
sglang/srt/entrypoints/http_server.py +35 -4
sglang/srt/function_call_parser.py +77 -5
sglang/srt/layers/attention/base_attn_backend.py +3 -0
sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
sglang/srt/layers/attention/flashattention_backend.py +28 -10
sglang/srt/layers/attention/flashmla_backend.py +8 -11
sglang/srt/layers/attention/utils.py +1 -1
sglang/srt/layers/attention/vision.py +2 -0
sglang/srt/layers/layernorm.py +38 -16
sglang/srt/layers/logits_processor.py +2 -2
sglang/srt/layers/moe/fused_moe_native.py +2 -4
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
sglang/srt/layers/pooler.py +6 -0
sglang/srt/layers/quantization/awq.py +5 -1
sglang/srt/layers/quantization/deep_gemm.py +17 -10
sglang/srt/layers/quantization/fp8.py +20 -22
sglang/srt/layers/quantization/fp8_utils.py +2 -2
sglang/srt/layers/quantization/int8_kernel.py +32 -1
sglang/srt/layers/radix_attention.py +13 -3
sglang/srt/layers/rotary_embedding.py +170 -126
sglang/srt/managers/data_parallel_controller.py +10 -3
sglang/srt/managers/io_struct.py +7 -0
sglang/srt/managers/mm_utils.py +85 -28
sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
sglang/srt/managers/schedule_batch.py +38 -12
sglang/srt/managers/scheduler.py +41 -28
sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
sglang/srt/managers/tokenizer_manager.py +5 -1
sglang/srt/managers/tp_worker.py +3 -3
sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
sglang/srt/mem_cache/memory_pool.py +87 -0
sglang/srt/model_executor/cuda_graph_runner.py +4 -3
sglang/srt/model_executor/forward_batch_info.py +51 -95
sglang/srt/model_executor/model_runner.py +19 -25
sglang/srt/models/deepseek.py +12 -2
sglang/srt/models/deepseek_nextn.py +101 -6
sglang/srt/models/deepseek_v2.py +144 -70
sglang/srt/models/deepseek_vl2.py +9 -4
sglang/srt/models/gemma3_causal.py +1 -1
sglang/srt/models/llama4.py +0 -1
sglang/srt/models/minicpmo.py +5 -1
sglang/srt/models/mllama4.py +2 -2
sglang/srt/models/qwen2_5_vl.py +3 -6
sglang/srt/models/qwen2_vl.py +3 -7
sglang/srt/models/roberta.py +178 -0
sglang/srt/openai_api/adapter.py +50 -11
sglang/srt/openai_api/protocol.py +2 -0
sglang/srt/reasoning_parser.py +25 -1
sglang/srt/server_args.py +31 -24
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
sglang/srt/torch_memory_saver_adapter.py +10 -1
sglang/srt/utils.py +5 -1
sglang/test/runners.py +6 -13
sglang/test/send_one.py +84 -28
sglang/test/test_utils.py +74 -18
sglang/version.py +1 -1
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0

sglang/srt/layers/quantization/deep_gemm.py CHANGED Viewed

@@ -25,7 +25,7 @@ if is_cuda():
     sm_version = get_device_sm()
     if sm_version == 90:
-        if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="false"):
+        if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
             _ENABLE_JIT_DEEPGEMM = True
 logger = logging.getLogger(__name__)
@@ -34,9 +34,10 @@ _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
 _ENABLE_JIT_DEEPGEMM_PRECOMPILE = get_bool_env_var(
     "SGL_JIT_DEEPGEMM_PRECOMPILE", "true"
 )
-_DO_COMPILE = get_bool_env_var("SGL_IS_FIRST_RANK_ON_NODE", "true")
+_DO_COMPILE_ALL = True
+_IS_FIRST_RANK_ON_NODE = get_bool_env_var("SGL_IS_FIRST_RANK_ON_NODE", "true")
 _COMPILE_WORKERS = get_int_env_var("SGL_JIT_DEEPGEMM_COMPILE_WORKERS", 4)
-_IN_PRE_COMPILE_STAGE = get_bool_env_var("SGL_IN_DEEP_GEMM_PRE_COMPILE_STAGE", "false")
+_IN_PRECOMPILE_STAGE = get_bool_env_var("SGL_IN_DEEPGEMM_PRECOMPILE_STAGE", "false")
 # Force redirect deep_gemm cache_dir
 os.environ["DG_CACHE_DIR"] = os.getenv(
@@ -46,7 +47,8 @@ os.environ["DG_CACHE_DIR"] = os.getenv(
 def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
     global _BUILTIN_M_LIST
-    global _DO_COMPILE
+    global _DO_COMPILE_ALL
+    global _IS_FIRST_RANK_ON_NODE
     # Generate m_max
     m_max = 1024 * 16
@@ -57,8 +59,13 @@ def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
     m_max = min(1024 * 128, m_max)
     _BUILTIN_M_LIST = list(range(1, m_max + 1))
-    # Check if is the first rank on node
-    _DO_COMPILE = ServerArgs.base_gpu_id == gpu_id
+    _IS_FIRST_RANK_ON_NODE = ServerArgs.base_gpu_id == gpu_id
+    # Check if is the first rank on node.
+    # Default each rank will try compile all Ms to
+    # load all symbols at the launch stages.
+    # Avoid loading symbols at the serving stages.
+    _DO_COMPILE_ALL = _IS_FIRST_RANK_ON_NODE or not _IN_PRECOMPILE_STAGE
 class DeepGemmKernelType(IntEnum):
@@ -89,7 +96,7 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
 def _compile_warning_1():
-    if not _IN_PRE_COMPILE_STAGE:
+    if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Complie session. "
             "And it may takes a long time(Typically 10-20 mins) "
@@ -276,7 +283,7 @@ def _maybe_compile_deep_gemm_one_type_all(
     query_key = (kernel_type, n, k, num_groups)
     if (
         _ENABLE_JIT_DEEPGEMM_PRECOMPILE
-        and _DO_COMPILE
+        and _DO_COMPILE_ALL
         and _INITIALIZATION_DICT.get(query_key) is None
     ):
         _INITIALIZATION_DICT[query_key] = True
@@ -286,7 +293,7 @@ def _maybe_compile_deep_gemm_one_type_all(
         logger.info(
             f"Try DeepGEMM JIT Compiling for "
             f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
-            f"{' It only takes a litte time(Typically 1 sec) if you have run `sglang.compile_deep_gemm`. ' if not _IN_PRE_COMPILE_STAGE else ''}"
+            f"{' It only takes a litte time(Typically 1 sec) if you have run `sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
         )
         # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
@@ -355,7 +362,7 @@ def gemm_nt_f8f8bf16(
 @contextmanager
 def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
-    if _IN_PRE_COMPILE_STAGE:
+    if _IN_PRECOMPILE_STAGE:
         yield
         return

sglang/srt/layers/quantization/fp8.py CHANGED Viewed

@@ -72,8 +72,8 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 if _is_hip:
-    from aiter import ActivationType
-    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages, ck_moe_2stages_win4
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
 if not _is_cuda:
@@ -484,7 +484,7 @@ class Fp8MoEMethod:
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = (
                 torch.uint32
-                if get_bool_env_var("USE_INT4_WEIGHT")
+                if get_bool_env_var("SGLANG_INT4_WEIGHT")
                 else torch.float8_e4m3fn
             )
         tp_size = get_tensor_model_parallel_world_size()
@@ -511,7 +511,7 @@ class Fp8MoEMethod:
                     )
         # WEIGHTS
-        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             # INT4 MoE weight - INT32 packed
             w13_weight = torch.nn.Parameter(
                 torch.empty(
@@ -585,7 +585,7 @@ class Fp8MoEMethod:
             if (
                 _is_hip
-            ):  # and get_bool_env_var("CK_MOE"): TODO: add check back after triton kernel
+            ):  # and get_bool_env_var("SGLANG_AITER_MOE"): TODO: add check back after triton kernel
                 # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
                 w13_weight_scale1 = torch.nn.Parameter(
                     torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -612,7 +612,7 @@ class Fp8MoEMethod:
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-            if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+            if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
                 extra_weight_attrs.update(
                     {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
                 )
@@ -644,7 +644,7 @@ class Fp8MoEMethod:
             layer.w2_input_scale = None
     def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             self.process_weights_hip_int4(layer)
             return
@@ -675,7 +675,7 @@ class Fp8MoEMethod:
                 )
                 layer.w2_input_scale = None
-                if get_bool_env_var("CK_MOE"):
+                if get_bool_env_var("SGLANG_AITER_MOE"):
                     # Pre-shuffle weights
                     layer.w13_weight.data = shuffle_weight(
                         layer.w13_weight.contiguous(), (16, 16)
@@ -798,17 +798,15 @@ class Fp8MoEMethod:
             return
     def process_weights_hip_int4(self, layer: Module):
-        # TODO: and get_bool_env_var("CK_MOE"): add after triton kernel added
+        # TODO: and get_bool_env_var("SGLANG_AITER_MOE"): add after triton kernel added
         # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
         # Weight Permutation
         layer.w13_weight = torch.nn.Parameter(
-            # permute_weight(layer.w13_weight.data),
             shuffle_weight(layer.w13_weight.data, (16, 16)),
             requires_grad=False,
         )
         torch.cuda.empty_cache()
         layer.w2_weight = torch.nn.Parameter(
-            # permute_weight(layer.w2_weight.data),
             shuffle_weight(layer.w2_weight.data, (16, 16)),
             requires_grad=False,
         )
@@ -847,23 +845,21 @@ class Fp8MoEMethod:
             padding_size,  # Avoid circular import
         )
-        if get_bool_env_var("CK_MOE"):
+        if get_bool_env_var("SGLANG_AITER_MOE"):
             layer.w13_weight = torch.nn.Parameter(
-                # permute_weight(layer.w13_weight.data),
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
             layer.w2_weight = torch.nn.Parameter(
-                # permute_weight(layer.w2_weight.data),
                 shuffle_weight(layer.w2_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
-            # ROCm (CK_MOE): using column-wise scaling
+            # ROCm (SGLANG_AITER_MOE): using column-wise scaling
             layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
             layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
-        elif get_bool_env_var("MOE_PADDING"):
+        elif get_bool_env_var("SGLANG_MOE_PADDING"):
             # If ROCm, apply weight padding (min. Mem channel contention) only if set
             layer.w13_weight = torch.nn.Parameter(
                 F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
@@ -912,15 +908,16 @@ class Fp8MoEMethod:
         )
         if _is_hip:
-            if get_bool_env_var("USE_INT4_WEIGHT"):
-                # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
+            if get_bool_env_var("SGLANG_INT4_WEIGHT"):
+                # TODO: add triton kernel and add check get_bool_env_var("SGLANG_AITER_MOE")
                 assert not no_combine, f"{no_combine=} is not supported."
-                return ck_moe_2stages_win4(
+                return ck_moe_2stages(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
+                    QuantType.per_Token,
                     layer.w13_weight_scale1,
                     layer.w2_weight_scale1,
                     activation=(
@@ -930,13 +927,13 @@ class Fp8MoEMethod:
                     ),
                 )
-            if get_bool_env_var("CK_MOE"):
+            if get_bool_env_var("SGLANG_AITER_MOE"):
                 assert not no_combine, f"{no_combine=} is not supported."
                 if self.block_quant:
-                    # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being.
+                    # TODO(SGLANG_AITER_MOE): FP8 block_quant only supports 'silu' for the time-being.
                     assert (
                         activation == "silu"
-                    ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
+                    ), f"SGLANG_AITER_MOE: FP8 bloack_quant {activation=} will be supported later, unset SGLANG_AITER_MOE"
                     return asm_moe(
                         x,
                         layer.w13_weight,
@@ -955,6 +952,7 @@ class Fp8MoEMethod:
                         layer.w2_weight,
                         topk_weights,
                         topk_ids,
+                        QuantType.per_Token,
                         layer.w13_weight_scale1,
                         layer.w2_weight_scale1,
                         activation=(

sglang/srt/layers/quantization/fp8_utils.py CHANGED Viewed

@@ -31,7 +31,7 @@ from sglang.srt.utils import (
 _is_hip = is_hip()
 _is_cuda = is_cuda()
-if _is_hip and get_bool_env_var("CK_MOE"):
+if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
     from aiter import gemm_a8w8_blockscale
 if _is_cuda:
@@ -132,7 +132,7 @@ def apply_w8a8_block_fp8_linear(
         output = fp8_blockwise_scaled_mm(
             q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
         )
-    elif _is_hip and get_bool_env_var("CK_MOE"):
+    elif _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=False
         )

sglang/srt/layers/quantization/int8_kernel.py CHANGED Viewed

@@ -8,7 +8,11 @@ import torch
 import triton
 import triton.language as tl
-from sglang.srt.utils import get_device_name
+from sglang.srt.utils import get_device_name, is_cuda
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import sgl_per_token_group_quant_int8
 logger = logging.getLogger(__name__)
@@ -165,6 +169,33 @@ def per_token_group_quant_int8(
     return x_q, x_s
+def sglang_per_token_group_quant_int8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.int8,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+    iinfo = torch.iinfo(dtype)
+    int8_max = iinfo.max
+    int8_min = iinfo.min
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
+    )
+    sgl_per_token_group_quant_int8(x, x_q, x_s, group_size, eps, int8_min, int8_max)
+    return x_q, x_s
 @triton.jit
 def _w8a8_block_int8_matmul(
     # Pointers to inputs and output

sglang/srt/layers/radix_attention.py CHANGED Viewed

@@ -87,13 +87,23 @@ class RadixAttention(nn.Module):
         v,
         forward_batch: ForwardBatch,
         save_kv_cache: bool = True,
+        **kwargs,
     ):
         if k is not None:
             # For cross-layer sharing, kv can be None
             assert v is not None
-            k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
-            v = v.view(-1, self.tp_v_head_num, self.v_head_dim)
+            if "k_rope" not in kwargs:
+                k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
+                v = v.view(-1, self.tp_v_head_num, self.v_head_dim)
+            else:
+                k = k.view(-1, self.tp_k_head_num, self.v_head_dim)
         return forward_batch.attn_backend.forward(
-            q, k, v, self, forward_batch, save_kv_cache
+            q,
+            k,
+            v,
+            self,
+            forward_batch,
+            save_kv_cache,
+            **kwargs,
         )

sglang/srt/layers/rotary_embedding.py CHANGED Viewed

@@ -14,8 +14,6 @@ _is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
-else:
-    from vllm._custom_ops import rotary_embedding as vllm_rotary_embedding
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -84,6 +82,12 @@ class RotaryEmbedding(CustomOp):
         # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
         if not _is_cuda:
             cache = cache.to(dtype)
+        if not _is_cuda or self.head_size not in [64, 128, 256, 512]:
+            from vllm._custom_ops import rotary_embedding
+            self.vllm_rotary_embedding = rotary_embedding
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
@@ -160,7 +164,7 @@ class RotaryEmbedding(CustomOp):
             )
         else:
             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
-            vllm_rotary_embedding(
+            self.vllm_rotary_embedding(
                 positions,
                 query,
                 key,
@@ -665,6 +669,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
+        dtype = query.dtype
         query_rot = query[..., : self.rotary_dim]
         key_rot = key[..., : self.rotary_dim]
         if self.rotary_dim < self.head_size:
@@ -695,7 +700,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         else:
             query = query_rot
             key = key_rot
-        return query, key
+        return query.to(dtype), key.to(dtype)
 class Llama3RotaryEmbedding(RotaryEmbedding):
@@ -876,142 +881,181 @@ class MRotaryEmbedding(RotaryEmbedding):
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
+    # Copied from https://github.com/huggingface/transformers/blob/c8e0e603de9b3d49161a15fe6e8ea84badfb5d02/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1439
     @staticmethod
-    def get_input_positions(
-        input_tokens: List[int],
-        image_grid_thw: Union[List[List[int]], torch.Tensor],
-        video_grid_thw: Union[List[List[int]], torch.Tensor],
+    def get_rope_index(
+        spatial_merge_size: int,
         image_token_id: int,
         video_token_id: int,
         vision_start_token_id: int,
-        vision_end_token_id: int,
-        spatial_merge_size: int,
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        second_per_grid_ts: Optional[torch.Tensor] = None,
+        model_type: str,
         tokens_per_second: Optional[int] = None,
-    ) -> Tuple[List[List[int]], int]:
-        """
-        Get mrope input positions and delta value.
-        :arg
-            second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
-                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
-        """
-        if isinstance(image_grid_thw, torch.Tensor):
-            image_grid_thw = image_grid_thw.tolist()
-        if isinstance(video_grid_thw, torch.Tensor):
-            video_grid_thw = video_grid_thw.tolist()
-        input_tokens_tensor = torch.tensor(input_tokens)
-        vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id
-        ).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
-        llm_pos_ids_list: list = []
-        st = 0
-        remain_images, remain_videos = image_nums, video_nums
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            if image_token_id in input_tokens and remain_images > 0:
-                ed_image = input_tokens.index(image_token_id, st)
-            else:
-                ed_image = len(input_tokens) + 1
-            if video_token_id in input_tokens and remain_videos > 0:
-                ed_video = input_tokens.index(video_token_id, st)
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                second_per_grid_t = 0
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                if second_per_grid_ts is not None:
-                    second_per_grid_t = second_per_grid_ts[video_index]
-                else:
-                    second_per_grid_t = 1.0
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
-            llm_grid_t, llm_grid_h, llm_grid_w = (
-                t,
-                h // spatial_merge_size,
-                w // spatial_merge_size,
-            )
-            text_len = ed - st
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-            t_index = (
-                torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
-                * second_per_grid_t
-                * tokens_per_second
-            ).flatten()
-            h_index = (
-                torch.arange(llm_grid_h)
-                .view(1, -1, 1)
-                .expand(llm_grid_t, -1, llm_grid_w)
-                .flatten()
-            )
-            w_index = (
-                torch.arange(llm_grid_w)
-                .view(1, 1, -1)
-                .expand(llm_grid_t, llm_grid_h, -1)
-                .flatten()
-            )
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        mrope_position_deltas = []
+        if input_ids is not None and (
+            image_grid_thw is not None or video_grid_thw is not None
+        ):
+            total_input_ids = input_ids
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
             )
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            image_index, video_index = 0, 0
+            for i, input_ids in enumerate(total_input_ids):
+                image_nums, video_nums = 0, 0
+                vision_start_indices = torch.argwhere(
+                    input_ids == vision_start_token_id
+                ).squeeze(1)
+                vision_tokens = input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = input_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0
+                remain_images, remain_videos = image_nums, video_nums
+                for _ in range(image_nums + video_nums):
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        second_per_grid_t = 0
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        if second_per_grid_ts is not None:
+                            second_per_grid_t = second_per_grid_ts[video_index]
+                        else:
+                            second_per_grid_t = 1.0
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    text_len = ed - st
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    if model_type == "qwen2_5_vl":
+                        range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                        expanded_range = range_tensor.expand(
+                            -1, llm_grid_h * llm_grid_w
+                        )
+                        time_tensor = (
+                            expanded_range * second_per_grid_t * tokens_per_second
+                        )
+                        time_tensor_long = time_tensor.long()
+                        t_index = time_tensor_long.flatten()
+                    elif model_type == "qwen2_vl":
+                        t_index = (
+                            torch.arange(llm_grid_t)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                    else:
+                        raise RuntimeError("Unimplemented")
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+                    )
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+                if st < len(input_tokens):
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, :] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(
+                    llm_positions.max() + 1 - len(total_input_ids[i])
+                )
+            mrope_position_deltas = torch.tensor(
+                mrope_position_deltas, device=input_ids.device
+            ).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            s = input_ids.shape[1]
+            position_ids = torch.arange(s)
+            position_ids = (
+                position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
             )
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-        return llm_positions.tolist(), mrope_position_delta
+            max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+                -1, keepdim=True
+            )[0]
+            mrope_position_deltas = max_position_ids + 1 - s
+            return position_ids, mrope_position_deltas
     @staticmethod
     def get_next_input_positions(
         mrope_position_delta: int,
         context_len: int,
         seq_len: int,
-    ) -> List[List[int]]:
-        return [
-            list(
-                range(
-                    context_len + mrope_position_delta, seq_len + mrope_position_delta
+    ) -> torch.Tensor:
+        return torch.tensor(
+            [
+                list(
+                    range(
+                        context_len + mrope_position_delta,
+                        seq_len + mrope_position_delta,
+                    )
                 )
-            )
-            for _ in range(3)
-        ]
+                for _ in range(3)
+            ]
+        )
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}

sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl

sglang 0.4.5.post3py3-none-any.whl → 0.4.6.post1py3-none-any.whl