sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +26 -4
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +676 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +49 -8
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +42 -8
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +78 -13
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +133 -55
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +434 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +41 -19
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +60 -20
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +80 -53
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +25 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -19
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +78 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +87 -33
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +67 -30
- sglang/srt/lora/mem_pool.py +117 -52
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +18 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +43 -5
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -30
- sglang/srt/managers/scheduler.py +290 -31
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -24
- sglang/srt/managers/tp_worker.py +4 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +36 -21
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +75 -8
- sglang/srt/model_loader/loader.py +171 -3
- sglang/srt/model_loader/weight_utils.py +51 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +31 -88
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +329 -73
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +694 -0
- sglang/srt/models/gemma3_mm.py +468 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +201 -104
- sglang/srt/openai_api/protocol.py +33 -7
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +114 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +140 -54
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +215 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +29 -2
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +56 -5
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/utils.py ADDED
@@ -0,0 +1,153 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
+
+from types import MappingProxyType
+from typing import List, Mapping, Tuple, Union
+
+import torch
+
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
+else:
+    from vllm import _custom_ops as vllm_ops
+
+
+def is_fp8_fnuz() -> bool:
+    # only device 0 is checked, this assumes MI300 platforms are homogeneous
+    return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
+
+
+def is_layer_skipped(
+    prefix: str,
+    ignored_layers: List[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> bool:
+    # prefix: model.layers.0.self_attn.q_proj
+    # proj_name: q_proj
+    proj_name = prefix.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping:
+        shard_prefixes = [
+            prefix.replace(proj_name, shard_proj_name)
+            for shard_proj_name in fused_mapping[proj_name]
+        ]
+
+        is_skipped = None
+        for shard_prefix in shard_prefixes:
+            is_shard_skipped = shard_prefix in ignored_layers
+
+            if is_skipped is None:
+                is_skipped = is_shard_skipped
+            elif is_shard_skipped != is_skipped:
+                raise ValueError(
+                    f"Detected some but not all shards of {prefix} "
+                    "are quantized. All shards of fused layers "
+                    "to have the same precision."
+                )
+    else:
+        is_skipped = prefix in ignored_layers
+
+    assert is_skipped is not None
+    return is_skipped
+
+
+def per_tensor_dequantize(
+    tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]
+) -> torch.Tensor:
+    fake_qweight = tensor.to(torch.float16)
+    dq_weight = fake_qweight * inv_scale
+    return dq_weight
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert len(x.shape) == 1
+    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
+
+
+def convert_to_channelwise(
+    weight_scale: torch.Tensor, logical_widths: List[int]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Create channelwise buffer
+    weight_scale_channel = torch.empty(
+        (sum(logical_widths), 1), dtype=torch.float32, device=weight_scale.device
+    )
+
+    # Handle scalar tensor case: broadcast same scale to all channels
+    if weight_scale.dim() == 0:
+        weight_scale_channel.fill_(weight_scale.item())
+        return weight_scale_channel
+
+    # Expand each scale to match the size of each logical matrix.
+    start = 0
+    for idx, logical_width in enumerate(logical_widths):
+        end = start + logical_width
+        weight_scale_channel[start:end, :] = weight_scale[idx]
+        start = end
+
+    return weight_scale_channel
+
+
+def requantize_with_max_scale(
+    weight: torch.Tensor, weight_scale: torch.Tensor, logical_widths: List[int]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Max scale to be used for requanitzation.
+    max_w_scale = weight_scale.max()
+
+    # QKV / MLP is fused in the on disk checkpoint if any of the
+    # weight scales are still set to the default since we initialize
+    # N weight scales for N shards but we only load 1 weight scale
+    # from disk in this case. Skip requantization in this case (since)
+    # we already are quantized with the single scale.
+    # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
+    unfused_module_in_checkpoint = (
+        weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min
+    )
+
+    # If unfused checkpoint, need requanize with the single scale.
+    if unfused_module_in_checkpoint:
+        start = 0
+        for idx, logical_width in enumerate(logical_widths):
+            end = start + logical_width
+            weight_dq = per_tensor_dequantize(weight[start:end, :], weight_scale[idx])
+            if _is_cuda:
+                weight[start:end, :], _ = sgl_scaled_fp8_quant(weight_dq, max_w_scale)
+            else:
+                weight[start:end, :], _ = vllm_ops.scaled_fp8_quant(
+                    weight_dq, max_w_scale
+                )
+            start = end
+
+    return max_w_scale, weight
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/layer_utils.py
+# Newly generated tensors need to replace existing tensors that are
+# already registered as parameters by vLLM (and won't be freed)
+def replace_parameter(
+    mod: torch.nn.Module, name: str, new: Union[torch.Tensor, torch.nn.Parameter]
+) -> None:
+
+    old = getattr(mod, name)
+    if (
+        type(old) is type(new)
+        and old.dtype == new.dtype
+        and old.untyped_storage().nbytes() == new.untyped_storage().nbytes()
+    ):
+        # If we can just update in-place to avoid re-registering
+        # can be faster if the underlying storage is the same
+        update_tensor_inplace(old, new)
+    else:
+        # Fallback re-register parameter, convert to Parameter if necessary
+        # this not only ensures we don't register a tensor as a parameter, but
+        # also ensures that all parameter subclasses get re-registered as
+        # parameters for `torch.compile` compatibility
+        if not isinstance(new, torch.nn.Parameter):
+            new = torch.nn.Parameter(new, requires_grad=False)
+        mod.register_parameter(name, torch.nn.Parameter(new, requires_grad=False))
sglang/srt/layers/quantization/w8a8_fp8.py CHANGED
@@ -9,9 +9,11 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
+    input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.utils import is_hip
@@ -22,12 +24,24 @@ _is_hip = is_hip()
 class W8A8Fp8Config(QuantizationConfig):
     """Config class for W8A8 FP8 Quantization.
 
-
-
+    Weight Quantization:
+        - Method: Static quantization
+        - Granularity: Per-channel
+        - Type: Symmetric
+
+    Activation Quantization:
+        - Method: Dynamic quantization
+        - Granularity: Per-token
+        - Type: Symmetric
+
+    Note:
+    - For models without offline quantization, weights will be quantized during model loading
+    - If CUTLASS is supported: Per-channel weight quantization is used
+    - If CUTLASS is not supported: Falls back to per-tensor weight quantization
     """
 
-    def __init__(self):
-
+    def __init__(self, is_checkpoint_fp8_serialized: bool = False):
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
 
     @classmethod
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
@@ -47,7 +61,9 @@ class W8A8Fp8Config(QuantizationConfig):
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "W8A8Fp8Config":
-
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = "compressed-tensors" in quant_method
+        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized)
 
     def get_quant_method(
         self,
@@ -72,13 +88,40 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         weight = layer.weight
-
-        if
-
-
-
-
-
+
+        if self.quantization_config.is_checkpoint_fp8_serialized:
+            weight_scale = layer.weight_scale.detach()
+            # If checkpoint offline quantized with w8a8_fp8, load the weight and weight_scale directly.
+            if _is_hip:
+                weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight, weight_scale=weight_scale
+                )
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+        else:
+            # If checkpoint not offline quantized, quantize the weights with per-channel quantization.
+            if self.cutlass_fp8_supported:
+                # if cutlass supported, we use cutlass_scaled_mm
+                # which requires per-channel quantization on weight
+                qweight, weight_scale = per_token_group_quant_fp8(
+                    layer.weight, layer.weight.shape[-1]
+                )
+                weight_scale = weight_scale.t().contiguous()
+                if _is_hip:
+                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=weight, weight_scale=weight_scale
+                    )
+            else:
+                # if cutlass not supported, we fall back to use torch._scaled_mm
+                # which requires per tensor quantization on weight
+                fp8_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+                qweight, weight_scale = input_to_float8(layer.weight, dtype=fp8_dtype)
+
+            # Update the layer with the new values.
+            layer.weight = Parameter(qweight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            layer.input_scale = None
 
     def create_weights(
         self,
@@ -90,6 +133,11 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
         params_dtype: torch.dtype,
         **extra_weight_attrs
     ):
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quantization_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
 
         weight_loader = extra_weight_attrs.get("weight_loader")
         self.logical_widths = output_partition_sizes
@@ -98,7 +146,7 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
             data=torch.empty(
                 sum(output_partition_sizes),
                 input_size_per_partition,
-                dtype=
+                dtype=weight_dtype,
             ),
             input_dim=1,
             output_dim=0,
@@ -106,12 +154,15 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
         )
         layer.register_parameter("weight", weight)
 
-
-
-
-
-
-
+        if self.quantization_config.is_checkpoint_fp8_serialized:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("weight_scale", weight_scale)
+        else:
+            layer.weight_scale = None
 
     def apply(
         self,
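The expanded docstring above spells out the scheme: static per-channel symmetric weight quantization, dynamic per-token symmetric activation quantization, with a per-tensor weight fallback when CUTLASS is unavailable. A rough plain-`torch` sketch of how such symmetric scales are chosen (illustrative only; the real path goes through `per_token_group_quant_fp8` / `input_to_float8` and fp8 kernels, and the tensor shapes here are arbitrary):

```python
import torch

# Assumes torch >= 2.1 for float8_e4m3fn dtype metadata.
fp8_max = torch.finfo(torch.float8_e4m3fn).max

weight = torch.randn(256, 4096)    # (out_features, in_features), known offline
activation = torch.randn(8, 4096)  # (num_tokens, in_features), known at runtime

# Static per-channel weight scales: one scale per output channel.
w_scale = weight.abs().amax(dim=1, keepdim=True) / fp8_max      # (256, 1)
# Dynamic per-token activation scales: one scale per token.
a_scale = activation.abs().amax(dim=1, keepdim=True) / fp8_max  # (8, 1)

w_q = (weight / w_scale).clamp(-fp8_max, fp8_max)       # would be cast to fp8
a_q = (activation / a_scale).clamp(-fp8_max, fp8_max)   # would be cast to fp8
```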
sglang/srt/layers/rotary_embedding.py CHANGED
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from vllm import _custom_ops as ops
 
 from sglang.srt.custom_op import CustomOp
 from sglang.srt.utils import is_cuda_available
@@ -14,6 +13,8 @@ from sglang.srt.utils import is_cuda_available
 _is_cuda_available = is_cuda_available()
 if _is_cuda_available:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
+else:
+    from vllm import _custom_ops as ops
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -147,7 +148,7 @@ class RotaryEmbedding(CustomOp):
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if _is_cuda_available:
+        if _is_cuda_available and (self.head_size in [64, 128, 256, 512]):
             apply_rope_with_cos_sin_cache_inplace(
                 positions=positions,
                 query=query,
@@ -168,76 +169,6 @@ class RotaryEmbedding(CustomOp):
             )
         return query, key
 
-    def forward_xpu(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        from vllm._ipex_ops import ipex_ops as ops
-
-        self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype)
-        ops.rotary_embedding(
-            positions,
-            query,
-            key,
-            self.head_size,
-            self.cos_sin_cache,
-            self.is_neox_style,
-        )
-        return query, key
-
-    def forward_hpu(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        from habana_frameworks.torch.hpex.kernels import (
-            RotaryPosEmbeddingMode,
-            apply_rotary_pos_emb,
-        )
-
-        positions = positions.flatten()
-        if offsets is not None:
-            positions = positions + offsets
-        num_tokens = positions.shape[0]
-        cos_sin = self.cos_sin_cache.index_select(0, positions).view(num_tokens, 1, -1)
-        cos, sin = cos_sin.chunk(2, dim=-1)
-        # HPU RoPE kernel requires hidden dimension for cos and sin to be equal
-        # to query hidden dimension, so the original tensors need to be
-        # expanded
-        # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE
-        # and expansion of cos/sin tensors via concatenation
-        # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE
-        # and expansion of cos/sin tensors via repeat_interleave
-        rope_mode: RotaryPosEmbeddingMode
-        if self.is_neox_style:
-            rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
-            cos = torch.cat((cos, cos), dim=-1)
-            sin = torch.cat((sin, sin), dim=-1)
-        else:
-            rope_mode = RotaryPosEmbeddingMode.PAIRWISE
-            sin = torch.repeat_interleave(sin, 2, dim=-1, output_size=cos_sin.shape[-1])
-            cos = torch.repeat_interleave(cos, 2, dim=-1, output_size=cos_sin.shape[-1])
-
-        query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
-        query_rot = query[..., : self.rotary_dim]
-        query_pass = query[..., self.rotary_dim :]
-        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode)
-        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
-
-        key_shape = key.shape
-        key = key.view(num_tokens, -1, self.head_size)
-        key_rot = key[..., : self.rotary_dim]
-        key_pass = key[..., self.rotary_dim :]
-        key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode)
-        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
-        return query, key
-
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"
@@ -510,16 +441,12 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
     ):
         super().__init__()
 
-        if rotary_dim != head_size:
-            raise ValueError(
-                f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \
-                    rotary_dim != head_size ({rotary_dim}!={head_size})."
-            )
         if is_neox_style is False:
             raise ValueError(
                 "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
            )
 
+        self.rotary_dim = rotary_dim
         self.head_size = head_size
         self.max_position_embeddings = max_position_embeddings
         self.original_max_position_embeddings = original_max_position_embeddings
@@ -568,8 +495,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
             * (
                 self.base
                 ** (
-                    torch.arange(0, self.
-                    / self.
+                    torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+                    / self.rotary_dim
                 )
            )
        )
@@ -618,8 +545,15 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         cos = cos.repeat(1, 2).unsqueeze(-2)
         sin = sin.repeat(1, 2).unsqueeze(-2)
 
-
-
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = query_rot * cos + _rotate_neox(query_rot) * sin
+        query = torch.cat((query_rot, query_pass), dim=-1)
+
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = key_rot * cos + _rotate_neox(key_rot) * sin
+        key = torch.cat((key_rot, key_pass), dim=-1)
 
         return query.flatten(-2), key.flatten(-2)
 
@@ -717,6 +651,18 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         query: torch.Tensor,
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if _is_cuda_available:
+            return self.forward_cuda(positions, query, key, offsets)
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         query_rot = query[..., : self.rotary_dim]
@@ -879,8 +825,17 @@ class MRotaryEmbedding(RotaryEmbedding):
         spatial_merge_size: int,
         context_len: int = 0,
         seq_len: Optional[int] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+        tokens_per_second: Optional[int] = None,
     ) -> Tuple[List[List[int]], int]:
-        """
+        """
+        Get mrope input positions and delta value.
+
+        :arg
+            second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
+                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+
+        """
 
         if isinstance(image_grid_thw, torch.Tensor):
             image_grid_thw = image_grid_thw.tolist()
@@ -917,6 +872,7 @@ class MRotaryEmbedding(RotaryEmbedding):
                     )
                     image_index += 1
                     remain_images -= 1
+                    second_per_grid_t = 0
                     ed = ed_image
                 else:
                     t, h, w = (
@@ -924,6 +880,10 @@ class MRotaryEmbedding(RotaryEmbedding):
                         video_grid_thw[video_index][1],
                         video_grid_thw[video_index][2],
                     )
+                    if second_per_grid_ts is not None:
+                        second_per_grid_t = second_per_grid_ts[video_index]
+                    else:
+                        second_per_grid_t = 1.0
                     video_index += 1
                     remain_videos -= 1
                     ed = ed_video
@@ -940,11 +900,11 @@ class MRotaryEmbedding(RotaryEmbedding):
                 )
 
                 t_index = (
-                    torch.arange(llm_grid_t)
-
-
-
-
+                    torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
+                    * second_per_grid_t
+                    * tokens_per_second
+                ).flatten()
+
                 h_index = (
                     torch.arange(llm_grid_h)
                     .view(1, -1, 1)
@@ -1172,6 +1132,37 @@ def get_rope(
     return rotary_emb
 
 
+# Copied from transformers
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+
+    # embedding is performed in float
+    cos = cos.unsqueeze(unsqueeze_dim).float()
+    sin = sin.unsqueeze(unsqueeze_dim).float()
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+
+    q_embed = q_embed.to(orig_q_dtype)
+    k_embed = k_embed.to(orig_k_dtype)
+
+    return q_embed, k_embed
+
+
 def get_rope_cpu(
     head_size: int,
     rotary_dim: int,
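`rotate_half` / `apply_rotary_pos_emb` above are the Hugging Face-style helpers, explicitly marked as copied from transformers. A self-contained usage sketch that builds the cos/sin tables the way such callers typically do; the shapes and the 10000.0 base are illustrative assumptions, not values taken from this diff:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Same split-and-negate convention as the helper added above.
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

batch, heads, seq, head_dim = 1, 4, 16, 64
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)

# Duplicate the half-dim frequencies so cos/sin cover the full head_dim.
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq).float(), inv_freq)  # (seq, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                   # (seq, head_dim)
cos, sin = emb.cos()[None], emb.sin()[None]               # (1, seq, head_dim)

# unsqueeze over the head dimension so cos/sin broadcast against (b, h, s, d),
# matching unsqueeze_dim=1 in apply_rotary_pos_emb() above.
q_rot = q * cos.unsqueeze(1) + rotate_half(q) * sin.unsqueeze(1)
k_rot = k * cos.unsqueeze(1) + rotate_half(k) * sin.unsqueeze(1)
```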
sglang/srt/layers/sampler.py CHANGED
@@ -168,7 +168,7 @@ class Sampler(nn.Module):
                 group=self.tp_sync_group,
             )
 
-        return batch_next_token_ids
+        return batch_next_token_ids
 
     def _apply_custom_logit_processor(
         self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
sglang/srt/lora/backend/base_backend.py CHANGED
@@ -5,7 +5,7 @@ import torch
 from sglang.srt.lora.utils import LoRABatchInfo
 
 
-def
+def get_fuse_output_add_from_name(name: str) -> bool:
     mapping = {
         "triton": True,
         "flashinfer": False,
@@ -28,14 +28,14 @@ class BaseLoRABackend:
     Args:
         name: name of backend
        batch_info: information of current batch for use
-
-        and the operation of
+        fuse_output_add: if set to True, the output buffer for storing result will be passed in when doing lora_b forward,
+                         and the operation of adding will be fused into kernel
     """
 
     def __init__(self, name: str, batch_info: LoRABatchInfo = None):
         self.name = name
         self.batch_info = batch_info
-        self.
+        self.fuse_output_add = get_fuse_output_add_from_name(name)
         self.fuse_stacked_lora_b = get_fuse_stacked_lora_b_from_name(name)
 
     def run_lora_a_sgemm(
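The new `fuse_output_add` flag tells callers whether the lora_b kernel can fuse the residual add (Triton) or whether the caller must add the delta itself (FlashInfer). A hypothetical caller-side sketch; `backend`, `hidden`, and `lora_b_weights` are illustrative names, and `run_lora_b_sgemm` is assumed from the `run_lora_a_sgemm` pattern above rather than confirmed by this diff:

```python
import torch

def apply_lora_b(backend, hidden: torch.Tensor, lora_b_weights: torch.Tensor,
                 base_output: torch.Tensor) -> torch.Tensor:
    if backend.fuse_output_add:
        # The base output buffer is handed to the kernel and the add is fused,
        # so the return value already contains base + lora delta.
        return backend.run_lora_b_sgemm(
            hidden, lora_b_weights, base_output=base_output
        )
    # Otherwise the backend returns only the lora delta and the caller adds it.
    return base_output + backend.run_lora_b_sgemm(hidden, lora_b_weights)
```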
sglang/srt/lora/backend/flashinfer_backend.py CHANGED
@@ -37,13 +37,16 @@ class FlashInferLoRABackend(BaseLoRABackend):
         self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
     ) -> torch.Tensor:
 
-        return
-
-
-
-
-
-
+        return (
+            self.segment_gemm.run(
+                x=x,
+                weights=weights,
+                batch_size=self.batch_info.bs,
+                weight_column_major=True,
+                seg_indptr=self.batch_info.seg_indptr,
+                weight_indices=self.batch_info.weight_indices,
+            )
+            * self.batch_info.scalings[0]
         )
 
     def run_qkv_lora(
@@ -90,7 +93,7 @@ class FlashInferLoRABackend(BaseLoRABackend):
             weights=kv_lora_b[1],
         )
 
-        return lora_output
+        return lora_output * self.batch_info.scalings[0]
 
     def run_gate_up_lora(
         self,
@@ -125,4 +128,4 @@ class FlashInferLoRABackend(BaseLoRABackend):
             weights=gate_up_lora_b[1],
         )
 
-        return lora_output
+        return lora_output * self.batch_info.scalings[0]
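The FlashInfer hunks now fold the adapter scaling factor (`self.batch_info.scalings[0]`, conventionally `lora_alpha / r`) into the returned delta rather than threading a `scaling` argument through the call. For reference, the unfused computation these backends implement, sketched in plain `torch` for a single adapter and ignoring the segment/batch bookkeeping:

```python
import torch

s, in_dim, out_dim, r, lora_alpha = 4, 32, 64, 8, 16
scaling = lora_alpha / r                 # per-adapter scaling factor

x = torch.randn(s, in_dim)
base_w = torch.randn(out_dim, in_dim)
lora_a = torch.randn(r, in_dim)          # lora_a: in_dim -> r
lora_b = torch.randn(out_dim, r)         # lora_b: r -> out_dim

base_output = x @ base_w.T
lora_delta = (x @ lora_a.T) @ lora_b.T   # what the lora_a / lora_b passes compute
y = base_output + scaling * lora_delta
```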
sglang/srt/lora/backend/triton_backend.py CHANGED
@@ -25,11 +25,10 @@ class TritonLoRABackend(BaseLoRABackend):
         x: torch.Tensor,
         weights: torch.Tensor,
         base_output: torch.Tensor = None,
-        scaling: float = 1.0,
         *args,
         **kwargs
     ) -> torch.Tensor:
-        return sgemm_lora_b_fwd(x, weights, self.batch_info, base_output
+        return sgemm_lora_b_fwd(x, weights, self.batch_info, base_output)
 
     def run_qkv_lora(
         self,
@@ -39,7 +38,6 @@ class TritonLoRABackend(BaseLoRABackend):
         output_offset: torch.Tensor,
         max_qkv_out_dim: int,
         base_output: torch.Tensor = None,
-        scaling: float = 1.0,
         *args,
         **kwargs
     ) -> torch.Tensor:
@@ -49,7 +47,7 @@ class TritonLoRABackend(BaseLoRABackend):
         # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
         assert isinstance(qkv_lora_b, torch.Tensor)
 
-        lora_a_output = sgemm_lora_a_fwd(x, qkv_lora_a, self.batch_info)
+        lora_a_output = sgemm_lora_a_fwd(x, qkv_lora_a, self.batch_info, stack_num=3)
         lora_output = qkv_lora_b_fwd(
             lora_a_output,
             qkv_lora_b,
@@ -57,7 +55,6 @@ class TritonLoRABackend(BaseLoRABackend):
             output_offset,
             max_qkv_out_dim,
             base_output,
-            scaling,
         )
         return lora_output
 
@@ -67,7 +64,6 @@ class TritonLoRABackend(BaseLoRABackend):
         gate_up_lora_a: torch.Tensor,
         gate_up_lora_b: torch.Tensor,
         base_output: torch.Tensor = None,
-        scaling: float = 1.0,
         *args,
         **kwargs
     ) -> torch.Tensor:
@@ -79,13 +75,14 @@ class TritonLoRABackend(BaseLoRABackend):
         output_dim = gate_up_lora_b.shape[-2] // 2
 
         # lora_a_output: (s, 2 * r)
-        lora_a_output = sgemm_lora_a_fwd(
+        lora_a_output = sgemm_lora_a_fwd(
+            x, gate_up_lora_a, self.batch_info, stack_num=2
+        )
         lora_output = gate_up_lora_b_fwd(
             lora_a_output,
             gate_up_lora_b,
             self.batch_info,
             output_dim,
             base_output,
-            scaling,
         )
         return lora_output